In [None]:
# importing modules needed
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
from nltk.corpus import stopwords
import warnings;
warnings.filterwarnings('ignore');
import torch
import time
from sklearn.neighbors import KNeighborsClassifier
import pickle
import os
from transformers import DistilBertTokenizer,DistilBertModel
from torch.utils.data import Dataset,DataLoader
from sklearn import naive_bayes, svm
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss,accuracy_score, classification_report, f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.cluster import KMeans
# Run torch code on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [None]:
# Locations used throughout the notebook.
# cur_dir: current working directory of the kernel at the time this cell runs.
cur_dir = os.getcwd()
# input_dir: root folder of the movie-script data set (note the trailing slash).
input_dir = '/home/gusmavko@GU.GU.SE/MovieScriptsParticipantsData/'
# script_dir: sub-folder holding the individual script files, built from input_dir.
script_dir = input_dir + 'Scripts'

## TF-IDF as Features

In [None]:
# Load the training table; later cells read its 'File_Name' and 'Labels' columns.
text_df = pd.read_csv('Train.csv')

In [None]:
# Read every script listed in 'File_Name' into a new 'text' column.
# Each file is opened inside a `with` block so its handle is closed as soon as it
# has been read (a bare open(...).read() leaves closing to the garbage collector).
script_texts = []
for file in text_df['File_Name']:
    with open(script_dir + os.sep + file, "r") as script_file:
        script_texts.append(script_file.read())
text_df['text'] = script_texts

In [None]:

def clean_text(text, stop_words=None):
    """Normalise raw text: strip apostrophes and non-letters, lowercase, drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Lowercase words to remove. When omitted, NLTK's English stop-word
        list (``stopwords.words('english')``) is used, as before.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests; the filtered result is the same as with a list.
    stop_words = set(stop_words)

    # Remove apostrophes outright so contractions stay one token ("don't" -> "dont").
    text = re.sub("'", "", text)
    # Replace every character that is not an ASCII letter with a space.
    text = re.sub("[^a-zA-Z]", " ", text)
    # Lowercase and split on whitespace, which also collapses repeated spaces.
    words = text.lower().split()

    return ' '.join(word for word in words if word not in stop_words)

In [None]:
# Shuffle the rows and renumber the index from 0.
text_df = text_df.sample(frac=1).reset_index(drop=True)

# Merge integer labels: each key is rewritten to the label given as its value.
replacements = {
    9: 8, 21: 8, 20: 8,
    3: 15, 10: 15, 12: 15, 13: 15,
    2: 7,
    18: 1, 17: 1,
}
text_df['Labels'] = text_df['Labels'].replace(replacements)

# Manual mapping of the remaining integer labels to informative genre names.
di = {
    6: 'scifi-comedy',
    19: 'action-crime',
    4: 'dramedy',
    0: 'action-drama',
    15: 'comedy',
    5: 'thriller',
    1: 'action-comedy',
    8: 'adventure-drama',
    16: 'scifi',
    11: 'horror-mystery',
    14: 'crime-thriller',
    7: 'family',
}
text_df = text_df.replace({"Labels": di})


In [None]:
# Bar chart of how many scripts carry each genre label.
text_df['Labels'].value_counts().plot.bar(xlabel='Genre', ylabel='# of scripts')


Comparing weighted F1 and accuracy scores of several scikit-learn classifiers trained on the TF-IDF features

In [None]:
def get_accuracy(train_x,train_y,test_x,test_y,classifier):
    """Fit `classifier` on TF-IDF features and score it on the held-out texts.

    The TF-IDF vocabulary and IDF weights are learned from `train_x` only, so
    nothing about the test documents leaks into the features.

    Parameters
    ----------
    train_x, test_x : iterable of str
        Raw training / test documents.
    train_y, test_y : array-like
        Labels matching `train_x` / `test_x`.
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple of float
        ``(weighted F1, accuracy)``, both as percentages rounded to two decimals.
    """
    vectorizer = TfidfVectorizer(max_features=10000)
    # Fit on the training split only; the test split is merely transformed.
    train_x_vec = vectorizer.fit_transform(train_x)
    test_x_vec = vectorizer.transform(test_x)

    classifier.fit(train_x_vec, train_y)
    predictions = classifier.predict(test_x_vec)

    # sklearn metrics take (y_true, y_pred). The order matters for the weighted
    # F1 because the per-class weights come from the true-label support.
    weighted_f1 = f1_score(test_y, predictions, average='weighted')
    accuracy = accuracy_score(test_y, predictions)

    return round(100 * weighted_f1, 2), round(100 * accuracy, 2)
    
# Split into train/test (80/20) for the TF-IDF experiments and define the sklearn classifiers to compare.
Train_X, Test_X, Train_Y, Test_Y = train_test_split(text_df['text'],text_df['Labels'],test_size=0.2)
nb = naive_bayes.MultinomialNB()
# This cell binds the name `svm` to the classifier, hiding the imported `svm` module.
# SVC is therefore reached through the `sklearn` package, so the cell can be re-run
# without failing on `svm.SVC` once `svm` is no longer the module.
svm = sklearn.svm.SVC(C=1.0, kernel='linear', degree=1, gamma='auto')
kn = KNeighborsClassifier(5)
lr = LogisticRegression(random_state=0)


Accuracy for the Multinomial Naive Bayes classifier

In [None]:
# get_accuracy returns a (weighted F1 %, accuracy %) pair, so the tuple displayed below is in that order.
acc_nb = get_accuracy(Train_X,Train_Y,Test_X, Test_Y, nb)
acc_nb


Accuracy for SVM classifier

In [None]:
# (weighted F1 %, accuracy %) for the linear-kernel SVC defined above.
acc_svm = get_accuracy(Train_X,Train_Y,Test_X, Test_Y, svm)
acc_svm

Accuracy for KNeighbors Classifier

In [None]:
# (weighted F1 %, accuracy %) for the 5-nearest-neighbours classifier defined above.
acc_kn = get_accuracy(Train_X,Train_Y,Test_X, Test_Y, kn)
acc_kn

Accuracy for Logistic Regression

In [None]:
# (weighted F1 %, accuracy %) for the logistic-regression classifier defined above.
log_reg_acc = get_accuracy(Train_X,Train_Y,Test_X, Test_Y, lr)
log_reg_acc

In [None]:
# Reload the table from disk for the BERT experiments: no 'text' column and no shuffling here.
df_text = pd.read_csv('Train.csv')
# Same label merging as in the TF-IDF section; the labels stay integers in this frame.
replacements = {9:8,21:8,20:8,3:15,10:15,12:15,13:15,2:7,18:1,17:1}
df_text['Labels'] =df_text['Labels'].replace(replacements)

## BERT embeddings as features

In [None]:
# Load the pretrained 'distilbert-base-uncased' tokenizer and encoder by name.
# They are passed to create_bert_embedding below, which runs the model in eval mode without gradients.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


In [None]:
def script_reader(file_name):
    """Return the full contents of one script file as a string.

    The path is built by joining the notebook-level `cur_dir` and `script_dir`
    with `file_name`, and the file is opened in text mode.
    """
    full_path = os.path.join(cur_dir, script_dir, file_name)
    with open(full_path, "r") as handle:
        contents = handle.read()
    return contents

In [None]:
# Since the scripts are long, each one is fed to the model as a series of fixed-size token windows.
class ScriptDataset(Dataset):
    """Dataset over the token windows of a single script.

    The script is read with `script_reader`, tokenized, and cut into
    consecutive windows of `seq_len` tokens. Indexing returns one window
    encoded to ids (with special tokens added) as a 1-D tensor.

    Parameters
    ----------
    file_path : str
        Script file name, passed to `script_reader`.
    tokenizer
        Object providing ``tokenize(text)`` and
        ``encode(tokens, add_special_tokens=True)``.
    seq_len : int, default 510
        Tokens per window. Per the original author's note this is the 512-token
        model limit minus the start and end special tokens.
    """

    def __init__(self, file_path,tokenizer,seq_len = 510):
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        # The raw text and the flat token list are kept in locals only, so they
        # are released once the windows are built.
        tokens = self.tokenizer.tokenize(script_reader(self.file_path))
        self.sequences = self._split_tokens(tokens, self.seq_len)

    @staticmethod
    def _split_tokens(tokens, seq_len):
        """Cut `tokens` into consecutive windows of `seq_len` items.

        A trailing window shorter than `seq_len` is dropped so that all windows
        have equal length and can be stacked into a batch. Two cases the former
        unconditional ``[:-1]`` got wrong are handled:

        * when the token count is an exact multiple of `seq_len`, the last
          window is full-length and is kept;
        * when the script has fewer than `seq_len` tokens, its single short
          window is kept so the dataset is not empty.
        """
        chunks = [tokens[start:start + seq_len] for start in range(0, len(tokens), seq_len)]
        if len(chunks) > 1 and len(chunks[-1]) < seq_len:
            chunks.pop()
        return chunks

    def __len__(self):
        """Number of token windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return window `idx` as a tensor of token ids including special tokens."""
        return torch.tensor(self.tokenizer.encode(self.sequences[idx], add_special_tokens=True))

In [None]:
def create_bert_embedding(file_path,tokenizer,model,batch_size = 2):
    """Embed one script as a 2-D numpy array with one row per token window.

    The script is wrapped in a `ScriptDataset` and pushed through `model`
    (moved to `device`, eval mode, gradients disabled) in batches of
    `batch_size`. For every window the position-0 vector of the model's first
    output is kept; per the original author's note this is the final hidden
    state of the CLS token, used as the sequence embedding.
    """
    model = model.to(device).eval()

    windows = DataLoader(
        ScriptDataset(file_path, tokenizer),
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
    )

    cls_vectors = []
    with torch.no_grad():
        for batch in windows:
            hidden_states = model(batch.to(device))[0]
            cls_vectors.append(hidden_states[:, 0, :])

    stacked = torch.cat(cls_vectors, dim=0)
    return stacked.cpu().detach().numpy()

In [None]:
# Embed every script listed in df_text, in row order; each list entry is the
# array returned by create_bert_embedding for one file.
Embeddings = []
for i in range(len(df_text)):
    Embeddings.append(create_bert_embedding(df_text.File_Name[i],tokenizer,model))
    # Print the row number every 50 scripts as a simple progress indicator.
    if i % 50 == 0:
        print(i)



In [None]:
# Persist the raw list of per-script embedding arrays.
with open('/home/gusmavko@GU.GU.SE/MovieScriptsParticipantsData/s_embeddings.pickle', 'wb') as handle:
    pickle.dump(Embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# Attach the embeddings to the table so labels and features are stored together.
df_text['Embeds'] = Embeddings

# NOTE(review): this writes df_raw_embeds2.pickle, whereas the loading cell further
# down reads df_raw_embeds.pickle; confirm which file is the intended one.
with open('/home/gusmavko@GU.GU.SE/MovieScriptsParticipantsData/df_raw_embeds2.pickle', 'wb') as handle:
    pickle.dump(df_text, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('Pickling completed')
# The dataframe is saved so that further analysis can start from this point.

In [None]:
# Entry point for resuming the analysis: load the pickled dataframe (with its
# 'Embeds' column) instead of recomputing the embeddings.
import pickle
# NOTE(review): the saving cell above writes df_raw_embeds2.pickle, not this file;
# confirm that df_raw_embeds.pickle is the version meant to be analysed.
df_embed_path = '/home/gusmavko@GU.GU.SE/MovieScriptsParticipantsData/df_raw_embeds.pickle'

# pickle.load can execute arbitrary code, so only load files produced by this notebook.
with open(df_embed_path, 'rb') as handle:
    df_text = pickle.load(handle)


In [None]:
# Assign each row to a stratified validation fold; the fold id is used later
# to cross-validate and obtain accuracy scores.
df_text["kfold"] = -1  # -1 marks rows that have not been given a fold yet
skf = StratifiedKFold()
for fold, (train_idx, val_idx) in enumerate(
    skf.split(X=df_text, y=df_text['Labels'].values)
):
    # Rows forming the validation part of this split are tagged with its number.
    df_text.loc[val_idx, 'kfold'] = fold
        

In [None]:
# Build a new data frame whose extra columns hold the feature values.
def create_df_embeds(feat_array, df):
    """Append one column per feature to a copy of `df`.

    Parameters
    ----------
    feat_array : array-like of shape (n_rows, n_features)
        Feature matrix whose i-th row belongs to the i-th row of `df`.
    df : pandas.DataFrame
        Frame to extend; it is not modified.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The extended frame and the names of the added columns
        (``feat_1`` ... ``feat_n``).

    Raises
    ------
    ValueError
        If `feat_array` is not 2-D or its row count differs from `len(df)`.
    """
    features = np.asarray(feat_array)
    if features.ndim != 2:
        raise ValueError(f'feat_array must be 2-D, got {features.ndim} dimension(s)')
    if len(features) != len(df):
        raise ValueError(
            f'feat_array has {len(features)} rows but df has {len(df)}'
        )

    col_names = ['feat_' + str(i+1) for i in range(features.shape[1])]
    # Reuse df's index so concat pairs rows by position; with a fresh RangeIndex
    # a non-default df index would be outer-joined and padded with NaN rows.
    df_feat = pd.DataFrame(data=features, columns=col_names, index=df.index)
    df_fit = pd.concat([df, df_feat], axis=1)
    return df_fit,col_names

In [None]:
# training function
def train(df_fit,classifier,feat_cols,label_col = 'Labels'):
    """Cross-validate `classifier` over the folds stored in ``df_fit.kfold``.

    For every distinct fold id, an independent copy of `classifier` is fitted
    on the rows of all other folds and evaluated on the held-out rows. The mean
    train/validation log-loss and accuracy are printed, each followed by its
    standard deviation across folds as a percentage of the mean.

    Parameters
    ----------
    df_fit : pandas.DataFrame
        Must contain a ``kfold`` column, the feature columns and `label_col`.
    classifier : estimator
        Template model exposing ``fit``, ``predict``, ``predict_proba`` and,
        once fitted, ``classes_``. It is deep-copied and never fitted itself.
    feat_cols : list of str
        Names of the feature columns.
    label_col : str, default 'Labels'
        Name of the target column.

    Returns
    -------
    list
        The fitted classifiers, one per fold, in ascending fold order.
    """
    import copy  # local import: only needed to give each fold its own model

    fold_ids = sorted(df_fit.kfold.unique())

    clfs = []
    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []

    for fold in fold_ids:
        train_df = df_fit[df_fit.kfold != fold].reset_index(drop=True)
        valid_df = df_fit[df_fit.kfold == fold].reset_index(drop=True)

        # A separate copy per fold: repeating one object would make every entry
        # of the returned list alias the model fitted on the last fold.
        clf = copy.deepcopy(classifier)
        clf.fit(train_df[feat_cols].values, train_df[label_col])
        clfs.append(clf)

        train_probs = clf.predict_proba(train_df[feat_cols].values)
        train_preds = clf.predict(train_df[feat_cols].values)
        # `labels` ties the probability columns to clf.classes_, so the loss is
        # still defined when a split does not contain every class.
        train_losses.append(log_loss(train_df[label_col], train_probs, labels=clf.classes_))
        train_accuracies.append(accuracy_score(train_df[label_col], train_preds))

        valid_probs = clf.predict_proba(valid_df[feat_cols].values)
        valid_preds = clf.predict(valid_df[feat_cols].values)
        valid_losses.append(log_loss(valid_df[label_col], valid_probs, labels=clf.classes_))
        valid_accuracies.append(accuracy_score(valid_df[label_col], valid_preds))

    mean_train_loss,std_train_loss = np.mean(train_losses),np.std(train_losses)
    mean_train_accuracy,std_train_accuracy  = np.mean(train_accuracies),np.std(train_accuracies)

    mean_valid_loss,std_valid_loss = np.mean(valid_losses),np.std(valid_losses)
    mean_valid_accuracy,std_valid_accuracy  = np.mean(valid_accuracies),np.std(valid_accuracies)

    print(f'train loss = {mean_train_loss} with {np.round((std_train_loss/mean_train_loss)*100, 2)}% error')
    print(f'valid loss = {mean_valid_loss} with {np.round((std_valid_loss/mean_valid_loss)*100, 2)}% error')

    print(f'train accuracy = {mean_train_accuracy} with {np.round((std_train_accuracy/mean_train_accuracy)*100, 2)}% error')
    print(f'valid accuracy = {mean_valid_accuracy} with {np.round((std_valid_accuracy/mean_valid_accuracy)*100, 2)}% error')
    return clfs

In [None]:
# Fix the numpy and torch (CPU and CUDA) seeds so the experiment is reproducible.
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Represent each script by the mean of its per-window embeddings (axis 0),
# stacked into one feature matrix with a row per script.
mean_array = np.stack([embeds.mean(axis=0) for embeds in df_text.Embeds], axis=0)
df_mean, cols = create_df_embeds(mean_array, df_text)

# Cross-validate a logistic regression on those mean embeddings.
classif = LogisticRegression(random_state=seed)
LR_clfs = train(df_mean, classif, cols)

In [None]:
# Per-fold accuracies entered by hand as fractions, five values each.
# NOTE(review): presumably copied from an earlier cross-validation run; nothing in
# this notebook recomputes them (train() only prints means), so confirm they are current.
train_accuracies = [0.3786346396965866, 0.37041719342604296, 0.37420986093552466, 0.37965887555274797, 0.36765634870499053]
valid_accuracies = [0.24242424242424243, 0.2702020202020202, 0.25252525252525254, 0.25569620253164554, 0.2683544303797468]

def make_pct(l):
    """Return a new list with each fraction in `l` times 100, rounded to two decimals."""
    return list(map(lambda fraction: round(fraction * 100, 2), l))
# Convert both lists to percentages for plotting; train_accuracies is overwritten in place.
train_accuracies = make_pct(train_accuracies)
# NOTE: the misspelled name `valid_acuuracies` is the one the plotting cell reads;
# `valid_accuracies` itself keeps the original fractions.
valid_acuuracies = make_pct(valid_accuracies)


In [None]:
# Line chart of the per-fold train and validation accuracies (values in percent),
# drawn through an explicit Axes object instead of the pyplot state machine.
folds = range(1,6)
fig, axes = plt.subplots()
axes.plot(folds, train_accuracies, 'g', label='Train accuracy')
axes.plot(folds, valid_acuuracies, 'b', label='Test accuracy')
axes.set_title('Training and Test accuracy')
axes.set_xlabel('Folds')
axes.set_ylabel('Accuracy')
axes.legend()
# Fix the y-range to the full percentage scale and put one tick on each fold.
axes.set_ylim([0,100])
axes.set_xticks([1,2,3,4,5])
plt.show()