In [113]:
import numpy as np
import pandas as pd
import ast

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [107]:
# read in data
df = pd.read_csv("data/final_data_for_svr.csv")

# drop columns
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])


# Topic Classification

In [108]:
# create a copy of the dataframe to work on
my_df = df.copy()

# keep only certain columns
my_df = my_df[['_id', 'duration', 'likes', 'title',
       'views', 'topics_list', 'processed_transcript', 'tb_score', 'vd_score']]
my_df['topics_list'] = my_df['topics_list'].apply(ast.literal_eval)

# print(len(my_df))

# drop NaN values
my_df = my_df.dropna()
# my_df = my_df[my_df['topics_list']!=[]]

# make sure there is at least one topic
my_df = my_df[my_df['topics_list'].map(lambda d: len(d)) > 0]
# print(len(my_df))

my_df.head()

Unnamed: 0,_id,duration,likes,title,views,topics_list,processed_transcript,tb_score,vd_score
0,21,992,17000,Meet the founder of the blog revolution,589115,"[culture, design, business, entertainment, sof...",past couple day preparing speech become nervou...,0.133855,0.9996
1,1,957,110000,Averting the climate crisis,3671801,"[climate change, culture, environment, global ...",thank much chris truly great honor opportunity...,0.131762,0.9997
2,7,1266,60000,Simplicity sells,2008487,"[technology, computers, entertainment, media, ...",music sound silence simon amp garfunkel hello ...,0.155493,0.9999
3,47,1126,80000,Chemical scum that dream of distant quasars,2694257,"[climate change, culture, environment, global ...",told go limb say something surprising try want...,0.066814,0.999
4,55,1524,14000,My wish: A global day of film,489757,"[culture, global issues, entertainment, social...",help wish think little kid friend ask genie co...,0.127882,0.9995


In [109]:
# find all topics in topic list and make a dictionary of their counts

topics = [thing for inner_list in my_df['topics_list'].tolist() for thing in inner_list]
count_topics = Counter(topics)
count_dict = dict(count_topics)

In [110]:
# choose the most popular topic as the topic for each talk
def extract_most_pop(listy_list):

    # print(listy_list)
    
    my_val = listy_list[0]

    for val in listy_list:
        if count_dict[val] > count_dict[my_val]:
            my_val = val

    return my_val

# for each row, pick most popular topic as its topic
my_df['topic'] = my_df['topics_list'].apply(extract_most_pop)

my_df.head()


Unnamed: 0,_id,duration,likes,title,views,topics_list,processed_transcript,tb_score,vd_score,topic
0,21,992,17000,Meet the founder of the blog revolution,589115,"[culture, design, business, entertainment, sof...",past couple day preparing speech become nervou...,0.133855,0.9996,culture
1,1,957,110000,Averting the climate crisis,3671801,"[climate change, culture, environment, global ...",thank much chris truly great honor opportunity...,0.131762,0.9997,science
2,7,1266,60000,Simplicity sells,2008487,"[technology, computers, entertainment, media, ...",music sound silence simon amp garfunkel hello ...,0.155493,0.9999,technology
3,47,1126,80000,Chemical scum that dream of distant quasars,2694257,"[climate change, culture, environment, global ...",told go limb say something surprising try want...,0.066814,0.999,science
4,55,1524,14000,My wish: A global day of film,489757,"[culture, global issues, entertainment, social...",help wish think little kid friend ask genie co...,0.127882,0.9995,culture


### Predict Topics from Numerical Values

In [111]:
# create training and testing data splits
class_df = my_df.copy()
class_df = class_df[['duration', 'likes', 'views', 'tb_score', 'vd_score', 'topic']]

X1 = class_df[['duration', 'likes', 'views', 'tb_score', 'vd_score']]
y1 = class_df[['topic']]

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [112]:
# fit the decision tree classifier
dtree = DecisionTreeClassifier(max_depth=5).fit(X_train1, y_train1)
dtree_predict = dtree.predict(X_test1)


# cm = confusion_matrix(y_test, dtree_predict)
# print(cm)

# print out metrics
print(accuracy_score(y_test1, dtree_predict))
print(classification_report(y_test1, dtree_predict))
print(multilabel_confusion_matrix(y_test1, dtree_predict, labels=dtree.classes_))

0.2560240963855422
                 precision    recall  f1-score   support

       Internet       0.00      0.00      0.00         1
    TED Fellows       0.00      0.00      0.00         5
         TED-Ed       0.38      0.70      0.49        99
           TEDx       0.00      0.00      0.00        61
       activism       0.00      0.00      0.00         3
      animation       0.00      0.00      0.00         3
   architecture       0.00      0.00      0.00         1
            art       0.00      0.00      0.00        10
        biology       0.00      0.00      0.00         2
          brain       0.00      0.00      0.00         3
       business       0.00      0.00      0.00        21
 climate change       0.00      0.00      0.00         6
  communication       0.00      0.00      0.00         9
      community       0.00      0.00      0.00         2
     creativity       0.00      0.00      0.00         8
        culture       0.38      0.08      0.14       108
           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Predict Topics from Transcript

In [114]:
# create training and testing data
new_df = my_df.copy()
new_df = new_df[['processed_transcript', 'topic']]

X_train2, X_test2, y_train2, y_test2 = train_test_split(new_df["processed_transcript"], new_df["topic"], test_size=0.2, random_state=42)

vectorizer = CountVectorizer()
x_train_vec = vectorizer.fit_transform(X_train2)
x_test_vec = vectorizer.transform(X_test2)

In [115]:
# decision tree
dtree = DecisionTreeClassifier(max_depth=5).fit(x_train_vec, y_train2)
dtree_predict = dtree.predict(x_test_vec)

print("Decision Tree Classifier")
print(accuracy_score(y_test2, dtree_predict))

# naive bayes
nb_model = MultinomialNB().fit(x_train_vec, y_train2)
nb_predict = nb_model.predict(x_test_vec)

print("Multinomial Naive Bayes")
print(accuracy_score(y_test2, nb_predict))

# random forest
rf_model = RandomForestClassifier().fit(x_train_vec, y_train2)
rf_predict = rf_model.predict(x_test_vec)

print("Random Forest Classifier")
print(accuracy_score(y_test2, rf_predict))

Decision Tree Classifier
0.34236947791164657
Multinomial Naive Bayes
0.44879518072289154
Random Forest Classifier
0.4106425702811245


In [None]:
# https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f
# https://www.analyticsvidhya.com/blog/2021/11/a-guide-to-building-an-end-to-end-multiclass-text-classification-model/

# Emotion Classification

In [116]:
emote_df = df.copy()

emote_df = emote_df[['duration', 'likes', 'views', 'processed_transcript', 'Inspired', 'Moved', 'Sad', 'Curious',
       'Informed', 'Impressed', 'Hopeful', 'Amused', 'Fulfilled', 'Surprised', 'tb_score', 'vd_score']]
# emote_df.head()

# maybe would want to pick one first
emote_df['emotion'] = emote_df[['Inspired', 'Moved', 'Sad', 'Curious',
       'Informed', 'Impressed', 'Hopeful', 'Amused', 'Fulfilled', 'Surprised']].idxmax(axis=1)

emote_df.head()

Unnamed: 0,duration,likes,views,processed_transcript,Inspired,Moved,Sad,Curious,Informed,Impressed,Hopeful,Amused,Fulfilled,Surprised,tb_score,vd_score,emotion
0,992,17000,589115,past couple day preparing speech become nervou...,0.9,0.8,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133855,0.9996,Inspired
1,957,110000,3671801,thank much chris truly great honor opportunity...,0.85,0.6,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.131762,0.9997,Inspired
2,1266,60000,2008487,music sound silence simon amp garfunkel hello ...,0.0,0.0,0.0,0.9,0.8,0.6,0.0,0.0,0.0,0.0,0.155493,0.9999,Curious
3,1126,80000,2694257,told go limb say something surprising try want...,0.9,0.7,0.0,0.0,0.65,0.0,0.0,0.0,0.0,0.0,0.066814,0.999,Inspired
4,1524,14000,489757,help wish think little kid friend ask genie co...,0.85,0.9,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.127882,0.9995,Moved


### Predict Emotion from Numerical Values

In [118]:
# create the training and testing datasets
new_emote_df = emote_df.copy()
new_emote_df = new_emote_df[['duration', 'likes', 'views', 'tb_score', 'vd_score', 'emotion']]

X3 = new_emote_df[['duration', 'likes', 'views', 'tb_score', 'vd_score']]
y3 = new_emote_df[['emotion']]

X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

# classify using a Decision Tree Classifier
dtree = DecisionTreeClassifier(max_depth=5).fit(X_train3, y_train3)
dtree_predict = dtree.predict(X_test3)

# print out metrics
cm = confusion_matrix(y_test3, dtree_predict)
print(cm)
print(accuracy_score(y_test3, dtree_predict))
print(classification_report(y_test3, dtree_predict))
print(multilabel_confusion_matrix(y_test3, dtree_predict, labels=dtree.classes_))

[[  0   0   0   0   0   5   0   0]
 [  0   0   0   1   1  87   1   0]
 [  0   0   0   0   0  18   0   0]
 [  0   0   0   0   0  18   0   0]
 [  0   0   0   0   1  48   0   0]
 [  0   3   0   2   1 876   3   0]
 [  0   1   0   1   0  56   1   0]
 [  0   0   0   0   1   2   0   0]]
0.7790594498669032
              precision    recall  f1-score   support

      Amused       0.00      0.00      0.00         5
     Curious       0.00      0.00      0.00        90
     Hopeful       0.00      0.00      0.00        18
   Impressed       0.00      0.00      0.00        18
    Informed       0.25      0.02      0.04        49
    Inspired       0.79      0.99      0.88       885
       Moved       0.20      0.02      0.03        59
         Sad       0.00      0.00      0.00         3

    accuracy                           0.78      1127
   macro avg       0.15      0.13      0.12      1127
weighted avg       0.64      0.78      0.69      1127

[[[1122    0]
  [   5    0]]

 [[1033    4]
  [  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Predict Emotion using Transcript

In [121]:
# create training and testing datasets
transcript_emote_df = emote_df.copy()
transcript_emote_df = transcript_emote_df[['processed_transcript', 'emotion']]
transcript_emote_df = transcript_emote_df.dropna()

X_train4, X_test4, y_train4, y_test4 = train_test_split(transcript_emote_df["processed_transcript"], transcript_emote_df["emotion"], test_size=0.2, random_state=42)

vectorizer4 = CountVectorizer()
x_train_vec4 = vectorizer4.fit_transform(X_train4)
x_test_vec4 = vectorizer4.transform(X_test4)

In [123]:
# decision tree
dtree4 = DecisionTreeClassifier(max_depth=5).fit(x_train_vec4, y_train4)
dtree_predict4 = dtree4.predict(x_test_vec4)

print("Decision Tree Classifier")
print(accuracy_score(y_test4, dtree_predict4))

# naive bayes
nb_model4 = MultinomialNB().fit(x_train_vec4, y_train4)
nb_predict4 = nb_model4.predict(x_test_vec4)

print("Multinomial Naive Bayes")
print(accuracy_score(y_test4, nb_predict4))

# random forest
rf_model4 = RandomForestClassifier().fit(x_train_vec4, y_train4)
rf_predict4 = rf_model4.predict(x_test_vec4)

print("Random Forest Classifier")
print(accuracy_score(y_test4, rf_predict4))

Decision Tree Classifier
0.7502507522567703
Multinomial Naive Bayes
0.7311935807422267
Random Forest Classifier
0.7552657973921765


In [124]:
emote_df['emotion'].value_counts()

emotion
Inspired     4465
Curious       451
Moved         289
Informed      189
Hopeful       124
Impressed      65
Amused         35
Sad             8
Surprised       4
Fulfilled       1
Name: count, dtype: int64