# <center>Using "More preprocessed" Data to run models on "All Speakers" and "Main Characters"</center>

- More preprocessed involves lemmatization and breaking down contractions
- Main characters (4) : Cartman, Butters, Kyle, Stan (excluded Randy)

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
#semi pre-cleaned dataset
df = pd.read_csv('cleaned_all-seasons.csv')

#creating new colummn is_cartman
df['is_cartman'] = 0
df.loc[df.Character == 'Cartman', 'is_cartman'] = 1

In [3]:
#Removing punctuation
import re, string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

corpus = df.Line.tolist()

for line in range(len(corpus)):
    corpus[line] = re.sub('\\n', '', corpus[line].rstrip()).lower()
    corpus[line] = " ".join(word.strip(string.punctuation) for word in corpus[line].split())
    

In [5]:
#lemmatizing
lem = WordNetLemmatizer()

def lemmatize_lines(line):
    word_list = word_tokenize(line)
    word_list = [lem.lemmatize(w, pos='v') for w in word_list]
    lem_line = ' '.join([lem.lemmatize(w) for w in word_list])
    
    return lem_line

In [None]:
for line in range(len(corpus)):
    corpus[line] = lemmatize_lines(corpus[line])
    
#corpus[6:12]

In [4]:
#additional stopwords - found from a kernal
sw = ['be', 'you', 'i', 'to', 'the', 'do', 'it',\
        'a', 'we', 'that', 'and', 'have', 'go', 'what',\
        'get', 'of', 'this', 'in', 'on', 'all', 'just',\
        'for', 'he', 'know', 'will', 'but', 'with', 'so',\
        'they', 'now', 'well', "'s", 'guy', 'u', 'come',\
        'like', 'there', 'at', 'would', 'who', 'him',\
        'them', 'his', 'thing', 'where', 'should', 'an',\
        'please', 'maybe', 'their', 'even', 'any', 'than']

**<center>Using All Speakers</center>**

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics


from sklearn.tree import DecisionTreeClassifier
from sklearn import tree


from sklearn.linear_model import LogisticRegression


In [8]:
tf_vect = TfidfVectorizer(stop_words=sw, ngram_range=(1,1))

X = tf_vect.fit_transform(corpus)
y = df.is_cartman

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=11)

**<u> Naive Bayes </u>**

In [9]:
mnb = MultinomialNB()

cv_scores = cross_val_score(mnb, X_train, y_train, cv=10)
params = {'alpha': [1.0, 1.25, 1.35, 1.4]}
nb_grid = GridSearchCV(mnb, params, cv=5) #using grid search for hyperparameter tuning

nb_grid.fit(X_train, y_train)

#training
print(classification_report(y_train, nb_grid.predict(X_train), np.unique(y)))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93     42785
           1       0.92      0.01      0.03      6842

    accuracy                           0.86     49627
   macro avg       0.89      0.51      0.48     49627
weighted avg       0.87      0.86      0.80     49627



In [11]:
#tesing 
print(classification_report(y_test, nb_grid.predict(X_test),np.unique(y))) 
y_pred = nb_grid.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93     18337
           1       0.50      0.01      0.01      2932

    accuracy                           0.86     21269
   macro avg       0.68      0.50      0.47     21269
weighted avg       0.81      0.86      0.80     21269

Recall: 0.007162346521145975


**<u> Decision Tree </u>**

In [13]:
dtree = tree.DecisionTreeClassifier(#max_depth=6,
    class_weight="balanced",
    min_weight_fraction_leaf=0.03)
dtree = dtree.fit(X_train,y_train)

#training
print(classification_report(y_train, dtree.predict(X_train), np.unique(y)))

              precision    recall  f1-score   support

           0       0.88      0.78      0.83     42785
           1       0.20      0.35      0.26      6842

    accuracy                           0.72     49627
   macro avg       0.54      0.57      0.54     49627
weighted avg       0.79      0.72      0.75     49627



In [14]:
#testing
print(classification_report(y_test, dtree.predict(X_test),np.unique(y))) #testing
y_pred = dtree.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.78      0.83     18337
           1       0.19      0.33      0.25      2932

    accuracy                           0.72     21269
   macro avg       0.54      0.56      0.54     21269
weighted avg       0.79      0.72      0.75     21269

Recall: 0.33321964529331516


**<u> Random Forest </u>**

In [33]:
rf = RandomForestClassifier(n_estimators=400,
                            max_features='sqrt',
                            class_weight='balanced',
                            max_depth=5,
                            oob_score=True,
                            random_state=3,
                            n_jobs=-1)

rf.fit(X_train, y_train)

#training
print(classification_report(y_train, rf.predict(X_train), np.unique(y)))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77     12166
           1       0.60      0.57      0.58      6842

    accuracy                           0.71     19008
   macro avg       0.68      0.68      0.68     19008
weighted avg       0.70      0.71      0.70     19008



In [16]:
#testing
print(classification_report(y_test, rf.predict(X_test), np.unique(y))) 
y_pred = rf.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.81      0.86     18337
           1       0.29      0.48      0.36      2932

    accuracy                           0.76     21269
   macro avg       0.60      0.64      0.61     21269
weighted avg       0.82      0.76      0.79     21269

Recall: 0.4771487039563438


**<u> SVM - Balanced </u>** 

In [34]:
svm = LinearSVC(C=0.05, max_iter=5000, class_weight='balanced', random_state=3)
svm.fit(X_train,y_train)
#training
print(classification_report(y_train, svm.predict(X_train), np.unique(y))) 

              precision    recall  f1-score   support

           0       0.84      0.75      0.80     12166
           1       0.63      0.75      0.69      6842

    accuracy                           0.75     19008
   macro avg       0.74      0.75      0.74     19008
weighted avg       0.77      0.75      0.76     19008



In [35]:
#testing
print(classification_report(y_test, svm.predict(X_test), np.unique(y))) 
y_pred = svm.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.70      0.73      5215
           1       0.54      0.63      0.58      2932

    accuracy                           0.67      8147
   macro avg       0.65      0.66      0.66      8147
weighted avg       0.69      0.67      0.68      8147

Recall: 0.6275579809004093


**<u> SVM - Unbalanced </u>** 

In [19]:
svm = LinearSVC(C=0.05, max_iter=5000, class_weight={0: 0.6, 1: 1.5}, random_state=3)
svm.fit(X_train,y_train)
#training
print(classification_report(y_train, svm.predict(X_train), np.unique(y))) 

              precision    recall  f1-score   support

           0       0.89      0.97      0.93     42785
           1       0.57      0.24      0.34      6842

    accuracy                           0.87     49627
   macro avg       0.73      0.61      0.64     49627
weighted avg       0.85      0.87      0.85     49627



In [20]:
#testing
print(classification_report(y_test, svm.predict(X_test), np.unique(y))) 
y_pred = svm.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92     18337
           1       0.49      0.19      0.27      2932

    accuracy                           0.86     21269
   macro avg       0.68      0.58      0.60     21269
weighted avg       0.83      0.86      0.83     21269

Recall: 0.18894952251023192


**<u> Logistic Regression </u>** 

In [21]:
logis = LogisticRegression(class_weight = "balanced") 
logis.fit(X_train, y_train)

#training
print(classification_report(y_train, logis.predict(X_train), np.unique(y))) 



              precision    recall  f1-score   support

           0       0.96      0.80      0.87     42785
           1       0.39      0.78      0.52      6842

    accuracy                           0.80     49627
   macro avg       0.67      0.79      0.69     49627
weighted avg       0.88      0.80      0.82     49627



In [22]:
#testing
print(classification_report(y_test, logis.predict(X_test), np.unique(y))) 
y_pred = logis.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.78      0.85     18337
           1       0.29      0.56      0.39      2932

    accuracy                           0.75     21269
   macro avg       0.61      0.67      0.62     21269
weighted avg       0.83      0.75      0.78     21269

Recall: 0.5648021828103683


### <center>Ensemble Method - All Characters </center>

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score

In [49]:
ensemble = VotingClassifier(estimators=[('LR',logis),('SVM', svm)], voting='hard',weights=[1,0.5])
ensemble_fit= ensemble.fit(X, y)

In [50]:
ensemble_train_preds = ensemble.predict(X_train) #get predicted outputs of the classifier
ensemble_train_f1 = f1_score(y_train, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble.predict(X_test)
ensemble_test_f1 = f1_score(y_test, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

print("Recall:",metrics.recall_score(y_test, ensemble_test_preds))


Train/test F1 for Ensemble:  0.7731481481481481 0.7698539339634222
Recall: 0.7598908594815825


# **<center>* * *</center>**

### **<center>Using Main Characters</center>**

In [7]:
top_speakers = df.groupby(['Character']).size().loc[df.groupby(['Character']).size() > 2500] 
top_speakers
print(top_speakers.index.values)
df1 = df.loc[df['Character'].isin(top_speakers.index.values)]

['Butters' 'Cartman' 'Kyle' 'Stan']


In [8]:
#Removing punctuation
corpus = df1.Line.tolist()

for line in range(len(corpus)):
    corpus[line] = re.sub('\\n', '', corpus[line].rstrip()).lower()
    corpus[line] = " ".join(word.strip(string.punctuation) for word in corpus[line].split())

In [9]:
#lemmatizing
lem = WordNetLemmatizer()

def lemmatize_lines(line):
    word_list = word_tokenize(line)
    word_list = [lem.lemmatize(w, pos='v') for w in word_list]
    lem_line = ' '.join([lem.lemmatize(w) for w in word_list])
    
    return lem_line

In [10]:
for line in range(len(corpus)):
    corpus[line] = lemmatize_lines(corpus[line])

In [11]:
tf_vect = TfidfVectorizer(stop_words=sw, ngram_range=(1,1))

X = tf_vect.fit_transform(corpus)
y = df1.is_cartman

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=11)

**<u> Naive Bayes </u>**

In [12]:
mnb = MultinomialNB()

cv_scores = cross_val_score(mnb, X_train, y_train, cv=10)
params = {'alpha': [1.0, 1.25, 1.35, 1.4]}
nb_grid = GridSearchCV(mnb, params, cv=5) #using grid search for hyperparameter tuning

nb_grid.fit(X_train, y_train)

#training
print(classification_report(y_train, nb_grid.predict(X_train), np.unique(y)))

              precision    recall  f1-score   support

           0       0.73      0.99      0.84     12166
           1       0.94      0.33      0.49      6842

    accuracy                           0.75     19008
   macro avg       0.83      0.66      0.66     19008
weighted avg       0.80      0.75      0.71     19008



In [13]:
#tesing 
print(classification_report(y_test, nb_grid.predict(X_test),np.unique(y))) 
y_pred = nb_grid.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.96      0.79      5215
           1       0.71      0.19      0.30      2932

    accuracy                           0.68      8147
   macro avg       0.69      0.57      0.55      8147
weighted avg       0.69      0.68      0.62      8147

Recall: 0.18963165075034105


**<u> Decision Tree </u>**

In [14]:
dtree = tree.DecisionTreeClassifier(#max_depth=6,
    class_weight="balanced",
    min_weight_fraction_leaf=0.03)
dtree = dtree.fit(X_train,y_train)

#training
print(classification_report(y_train, dtree.predict(X_train), np.unique(y)))

              precision    recall  f1-score   support

           0       0.69      0.79      0.74     12166
           1       0.49      0.36      0.41      6842

    accuracy                           0.63     19008
   macro avg       0.59      0.57      0.57     19008
weighted avg       0.62      0.63      0.62     19008



In [15]:
#testing
print(classification_report(y_test, dtree.predict(X_test),np.unique(y))) #testing
y_pred = dtree.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.79      0.73      5215
           1       0.48      0.35      0.41      2932

    accuracy                           0.63      8147
   macro avg       0.58      0.57      0.57      8147
weighted avg       0.61      0.63      0.61      8147

Recall: 0.3506139154160982


**<u> Random Forest </u>**

In [16]:
rf = RandomForestClassifier(n_estimators=400,
                            max_features='sqrt',
                            class_weight='balanced',
                            max_depth=5,
                            oob_score=True,
                            random_state=3,
                            n_jobs=-1)

rf.fit(X_train, y_train)

#training
print(classification_report(y_train, rf.predict(X_train), np.unique(y)))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77     12166
           1       0.60      0.57      0.58      6842

    accuracy                           0.71     19008
   macro avg       0.68      0.68      0.68     19008
weighted avg       0.70      0.71      0.70     19008



In [17]:
#testing
print(classification_report(y_test, rf.predict(X_test), np.unique(y))) 
y_pred = rf.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75      5215
           1       0.56      0.53      0.54      2932

    accuracy                           0.68      8147
   macro avg       0.65      0.65      0.65      8147
weighted avg       0.68      0.68      0.68      8147

Recall: 0.5313778990450204


**<u> SVM (Balanced) </u>**

In [12]:
svm_bal = LinearSVC(C=0.05, max_iter=5000, class_weight='balanced', random_state=3)
svm_bal.fit(X_train,y_train)
#training
print(classification_report(y_train, svm_bal.predict(X_train), np.unique(y))) 

              precision    recall  f1-score   support

           0       0.84      0.75      0.80     12166
           1       0.63      0.75      0.69      6842

    accuracy                           0.75     19008
   macro avg       0.74      0.75      0.74     19008
weighted avg       0.77      0.75      0.76     19008



In [13]:
#testing
print(classification_report(y_test, svm_bal.predict(X_test), np.unique(y))) 
y_pred = svm_bal.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.70      0.73      5215
           1       0.54      0.63      0.58      2932

    accuracy                           0.67      8147
   macro avg       0.65      0.66      0.66      8147
weighted avg       0.69      0.67      0.68      8147

Recall: 0.6275579809004093


**<u> SVM (Unbalanced) </u>**

In [14]:
svm = LinearSVC(C=0.05, max_iter=5000, class_weight={0: 0.6, 1: 1.5}, random_state=3)
svm.fit(X_train,y_train)
#training
print(classification_report(y_train, svm.predict(X_train), np.unique(y))) 

              precision    recall  f1-score   support

           0       0.90      0.45      0.60     12166
           1       0.48      0.91      0.63      6842

    accuracy                           0.62     19008
   macro avg       0.69      0.68      0.62     19008
weighted avg       0.75      0.62      0.61     19008



In [15]:
#testing
print(classification_report(y_test, svm.predict(X_test), np.unique(y))) 
y_pred = svm.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.40      0.54      5215
           1       0.45      0.87      0.59      2932

    accuracy                           0.57      8147
   macro avg       0.64      0.63      0.56      8147
weighted avg       0.70      0.57      0.56      8147

Recall: 0.8656207366984994


**<u> Logistic Regression </u>**

In [16]:
logis = LogisticRegression(class_weight = "balanced") 
logis.fit(X_train, y_train)

#training
print(classification_report(y_train, logis.predict(X_train), np.unique(y))) 



              precision    recall  f1-score   support

           0       0.86      0.78      0.82     12166
           1       0.66      0.77      0.71      6842

    accuracy                           0.78     19008
   macro avg       0.76      0.78      0.77     19008
weighted avg       0.79      0.78      0.78     19008



In [17]:
#testing
print(classification_report(y_test, logis.predict(X_test), np.unique(y))) 
y_pred = logis.predict(X_test)
print("Recall:",metrics.recall_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.71      0.74      5215
           1       0.55      0.62      0.58      2932

    accuracy                           0.68      8147
   macro avg       0.66      0.67      0.66      8147
weighted avg       0.69      0.68      0.68      8147

Recall: 0.6193724420190996


### <center>Ensemble Method - Main Characters </center>

In [56]:
ensemble = VotingClassifier(estimators=[('LR',logis),('SVM -Unbalanced', svm)],voting='hard',weights=[0.5,0.5])
ftted= ensemble.fit(X, y)
ensemble_train_preds = ensemble.predict(X_train) #get predicted outputs of the classifier
ensemble_train_f1 = f1_score(y_train, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble.predict(X_test)
ensemble_test_f1 = f1_score(y_test, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

print("Recall:",metrics.recall_score(y_test, ensemble_test_preds))


Train/test F1 for Ensemble:  0.7731481481481481 0.7698539339634222
Recall: 0.7598908594815825


In [57]:
ensemble = VotingClassifier(estimators=[('LR',logis),('SVM -Unbalanced', svm)], voting='hard',weights=[0.5,1])
ftted= ensemble.fit(X, y)

In [59]:
ensemble_train_preds = ensemble.predict(X_train) #get predicted outputs of the classifier
ensemble_train_f1 = f1_score(y_train, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble.predict(X_test)
ensemble_test_f1 = f1_score(y_test, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

print("Recall:",metrics.recall_score(y_test, ensemble_test_preds))
print("Precision:",metrics.precision_score(y_test, ensemble_test_preds))


Train/test F1 for Ensemble:  0.6327335858585859 0.6350803976924021
Recall: 0.9017735334242838
Precision: 0.49615312441358606
