# Importing Libraries

In [None]:
import nltk
import pandas as pd
import numpy as np
import re
import xgboost as xgb



# Importing Dataset and reducing the dimensions and no. of classes

In [None]:
from google.colab import files
uploaded=files.upload()
import io
df = pd.read_csv(io.BytesIO(uploaded['All-seasons.csv']))
#df=pd.read_csv('All-seasons.csv')

Saving All-seasons.csv to All-seasons.csv



This dataset contains dialogues and speakers from season 9 and 10 of the animated series south park.
The season and the episode are irrelevant to our task, i.e., to predict the speaker from the dialogue.


In [None]:
classes=['Cartman','Stan','Kyle','Butters','Randy','Mr. Garrison','Kenny']
data=df.loc[df['Character'].isin(classes)]
data=data.iloc[:,2:4]
print(data.head(10))
shape=data.shape
print(shape)

   Character                                               Line
0       Stan         You guys, you guys! Chef is going away. \n
1       Kyle                        Going away? For how long?\n
2       Stan                                         Forever.\n
4       Stan  Chef said he's been bored, so he joining a gro...
9    Cartman  I'm gonna miss him.  I'm gonna miss Chef and I...
10      Stan  Dude, how are we gonna go on? Chef was our fuh...
17     Randy  Good-bye, Chef! Have a great time with the Sup...
19      Kyle                           Draw two card, fatass.\n
20   Cartman                            Reverse to you, Jew. \n
21      Stan                                    I'll get it. \n
(31505, 2)


# Creating Corpus

function for obtaining the part of speech for lemmatisation

In [None]:
#nltk.download('popular')
from nltk import wordnet
def get_wordnet_pos(tag):
 
    if tag.startswith('J'):
        return "a"
    elif tag.startswith('V'):
        return "v"
    elif tag.startswith('N'):
        return "n"
    elif tag.startswith('R'):
        return "r"
    else:
        return "n"

cleaning the texts and creating the corpus

In [None]:
nltk.download('popular')
corpus=[]
from nltk.stem import WordNetLemmatizer
lemmatiser=WordNetLemmatizer()
for dialog in data.Line:
    dialog=nltk.word_tokenize(dialog)
    for i in range(0,len(dialog)):
        dialog[i]= re.sub('\'s','is',dialog[i])
        dialog[i]=re.sub('\'m','am',dialog[i])
        dialog[i]=re.sub('n\'t','not',dialog[i])
        dialog[i]=re.sub('\'ve','have',dialog[i])
        dialog[i]=re.sub('[^a-zA-Z]',' ',dialog[i])
        dialog[i]=dialog[i].lower()
    pos=nltk.pos_tag(dialog)
    dialog=[]
    for word,tag in pos:
        if (word is not ' '):
            l=lemmatiser.lemmatize(word,get_wordnet_pos(tag))
            dialog.append(l)
    dialog=' '.join(dialog)
    corpus.append(dialog)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

Encoding Target Variable

In [None]:
y=data['Character']
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
le.classes_


array(['Butters', 'Cartman', 'Kenny', 'Kyle', 'Mr. Garrison', 'Randy',
       'Stan'], dtype=object)

# Creating the Bag of Words model

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(ngram_range=(3,3),max_features=500)
x = tv.fit_transform(corpus).toarray()


# Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)


# training the model

In [None]:

from sklearn.datasets import dump_svmlight_file
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')


[16:10:51] 25204x500 matrix with 20926 entries loaded from dtrain.svm
[16:10:51] 6301x500 matrix with 5292 entries loaded from dtest.svm


In [None]:
param = {
    'max_depth': 5,  # the maximum depth of each tree
    'eta': 0.1,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 8}  # the number of classes that exist in this datset
num_round = 100  # the number of training iterations
bst = xgb.train(param, dtrain_svm, num_round)
preds = bst.predict(dtest_svm)
best_preds = np.asarray([np.argmax(line) for line in preds])

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, best_preds))
print(confusion_matrix(y_test, best_preds))

0.3223297889223933
[[   5  500    0    6    0    0   10]
 [   2 1842    3   23    0    5   80]
 [   0  170    0    0    0    0    6]
 [   1 1260    1   51    0    7  100]
 [   0  193    0    2    0    0    5]
 [   0  460    0    5    0    2   26]
 [   1 1374    0   24    0    6  131]]
