In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix, 
    ConfusionMatrixDisplay
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [None]:
# Load the dataset, discard every row containing a missing value, and add the
# character length of each transcription as a new `text_length` column.
data = (
    pd.read_csv("tiktok_dataset.csv")
    .dropna(axis=0)
    .assign(text_length=lambda frame: frame['video_transcription_text'].str.len())
)

# Side-by-side histogram of transcription length, one bar series per claim_status.
length_ax = sns.histplot(
    data=data,
    x="text_length",
    hue="claim_status",
    palette="pastel",
    stat="count",
    multiple="dodge",
    element="bars",
    kde=False,
    legend=True,
)
length_ax.set_xlabel("video_transcription_text length (number of characters)")
length_ax.set_ylabel("Count")
length_ax.set_title("Distribution of video_transcription_text length for claims and opinions")
plt.show()

# Build the modelling frame: drop the row counter ('#') and the video id.
# `drop` returns a new frame, so `data` itself is left untouched.
X = data.drop(columns=['#', 'video_id'])

# Encode the target as integers (opinion -> 0, claim -> 1).
# `.map` is used instead of `.replace`: replacing strings with ints relies on
# pandas silently downcasting the object column, which is deprecated
# (FutureWarning on pandas 2.2+). The explicit cast also fails loudly if a
# label other than 'opinion'/'claim' is present (it would map to NaN).
X['claim_status'] = X['claim_status'].map({'opinion': 0, 'claim': 1}).astype('int64')

# One-hot encode the two categorical columns; drop_first avoids a redundant
# indicator column per variable.
X = pd.get_dummies(X, columns=['verified_status', 'author_ban_status'], drop_first=True)

# Separate the target from the predictors (`pop` removes the column from X).
y = X.pop('claim_status')

# 60/20/20 split: hold out 20% for testing, then take 25% of the remaining 80%
# (i.e. 20% of the total) for validation.
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=0.25, random_state=0)

def add_ngram_counts(frame, vectorizer, text_col='video_transcription_text', fit=False):
    """Replace a text column with n-gram count columns produced by `vectorizer`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame that contains `text_col`.
    vectorizer : CountVectorizer
        Vectorizer used to turn the text into counts.
    text_col : str
        Name of the text column to vectorize and then drop.
    fit : bool
        If True the vectorizer is fitted on `frame[text_col]` (training split
        only); otherwise the already-fitted vocabulary is applied.

    Returns
    -------
    pandas.DataFrame
        `frame` without `text_col`, with a fresh 0..n-1 index, followed by one
        count column per vocabulary entry.
    """
    text = frame[text_col]
    counts = vectorizer.fit_transform(text) if fit else vectorizer.transform(text)
    counts_df = pd.DataFrame(data=counts.toarray(), columns=vectorizer.get_feature_names_out())
    # The counts frame has a fresh RangeIndex, so the other columns must be
    # re-indexed the same way or concat would align on mismatched labels.
    remaining = frame.drop(columns=[text_col]).reset_index(drop=True)
    return pd.concat([remaining, counts_df], axis=1)


# Keep the 15 most frequent 2- and 3-word phrases, ignoring English stop words.
count_vec = CountVectorizer(ngram_range=(2, 3), max_features=15, stop_words='english')

# Fit the vocabulary on the training split only, then reuse it unchanged for
# the validation and test splits so no information leaks from held-out data.
X_train_final = add_ngram_counts(X_train, count_vec, fit=True)
X_val_final = add_ngram_counts(X_val, count_vec)
X_test_final = add_ngram_counts(X_test, count_vec)

# Random forest tuned with an exhaustive grid search.
rf = RandomForestClassifier(random_state=0)

# 3 * 2 * 1 * 2 * 2 * 3 = 72 candidate settings, each evaluated with 5-fold CV.
cv_params = {'max_depth': [5, 7, None],
             'max_features': [0.3, 0.6],
             'max_samples': [0.7],
             'min_samples_leaf': [1, 2],
             'min_samples_split': [2, 3],
             'n_estimators': [75, 100, 200]}

# Multi-metric scoring must be a list/tuple/dict: a set is rejected by
# scikit-learn's parameter validation in recent releases and has no stable
# iteration order.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# refit='recall': the best candidate is chosen by mean cross-validated recall
# and refitted on the whole training split.
rf_cv = GridSearchCV(rf, cv_params, scoring=scoring, cv=5, refit='recall')
rf_cv.fit(X_train_final, y_train)

# Best mean CV recall and the hyperparameters that achieved it.
rfcv_s, rfcv_p = rf_cv.best_score_, rf_cv.best_params_

# Gradient-boosted trees (binary logistic objective) tuned with a grid search.
xgb = XGBClassifier(objective='binary:logistic', random_state=0)

# 3 * 2 * 2 * 2 = 24 candidate settings, each evaluated with 5-fold CV.
cv_params = {'max_depth': [4, 8, 12],
             'min_child_weight': [3, 5],
             'learning_rate': [0.01, 0.1],
             'n_estimators': [300, 500]}

# Multi-metric scoring must be a list/tuple/dict: a set is rejected by
# scikit-learn's parameter validation in recent releases and has no stable
# iteration order.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# refit='recall': the best candidate is chosen by mean cross-validated recall
# and refitted on the whole training split.
xgb_cv = GridSearchCV(xgb, cv_params, scoring=scoring, cv=5, refit='recall')
xgb_cv.fit(X_train_final, y_train)

# Best mean CV recall and the hyperparameters that achieved it.
xgcv_s, xgcv_p = xgb_cv.best_score_, xgb_cv.best_params_

# Class names ordered by encoded label: 0 -> opinion, 1 -> claim.
target_labels = ['opinion', 'claim']


def show_confusion_matrix(y_true, y_pred, title):
    """Plot a titled confusion matrix with class-name tick labels.

    Parameters
    ----------
    y_true : array-like
        True encoded labels (0/1).
    y_pred : array-like
        Predicted encoded labels (0/1).
    title : str
        Title placed above the plot.

    Returns
    -------
    numpy.ndarray
        The 2x2 confusion matrix (rows: true class, columns: predicted class).
    """
    # Pin the label order so the rows/columns always line up with target_labels.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_labels)
    disp.plot()
    disp.ax_.set_title(title)
    plt.show()
    return cm


# --- Validation set: compare the two tuned models -------------------------
y_pred_rf = rf_cv.best_estimator_.predict(X_val_final)
rf_val_cm = show_confusion_matrix(y_val, y_pred_rf, 'Random Forest - validation set')
rf_report = classification_report(y_val, y_pred_rf, target_names=target_labels)
print(rf_report)  # previously computed but never displayed

y_pred_xg = xgb_cv.best_estimator_.predict(X_val_final)
xgb_val_cm = show_confusion_matrix(y_val, y_pred_xg, 'XGBoost - validation set')
xg_report = classification_report(y_val, y_pred_xg, target_names=target_labels)
print(xg_report)  # previously computed but never displayed

# --- Test set: final check of the random forest on unseen data ------------
y_pred = rf_cv.best_estimator_.predict(X_test_final)
rf_test_cm = show_confusion_matrix(y_test, y_pred, 'Random forest - test set')

# Impurity-based feature importances of the refitted best random forest.
importances = rf_cv.best_estimator_.feature_importances_

# Label the importances with the columns of the frame the model was fitted on
# (the training frame), not the test frame that merely shares its columns.
rf_importances_sr = pd.Series(importances, index=X_train_final.columns)

fig, ax = plt.subplots()
rf_importances_sr.plot.bar(ax=ax)
ax.set_title('Feature importances')
ax.set_ylabel('Mean decrease in impurity')
fig.tight_layout()
plt.show()  # consistent with the other figures in this notebook





# eof