In [131]:
import pandas as pd
import os
from progressbar import ProgressBar
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob, Word
import string
import nltk

In [132]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load("en_core_web_sm")

In [133]:
stopwords = set(STOPWORDS)
from spacy.lang.en.stop_words import STOP_WORDS

In [134]:
def getData(file):
    data = pd.read_csv("../data/extracted/"+file,delimiter=',')
    moviename = ' '.join(file[:-4].split('_'))
    return data, moviename

In [135]:
def convertToModelInputFormat(data):
    ans = []
    speakers = list(set(list(data['Speaker'])))
    for speaker in speakers:
        dialogues = data[data['Speaker']==speaker]
        dialogues = list(dialogues['Dialogue'])
        temp = []
        temp.append(speaker)
        temp.append(dialogues)
        ans.append(temp)
    return ans

In [136]:
def getTrueLabels(df, moviename):
    labelsfile = pd.read_csv('../data/Pre-processing_files/polygraph_matched_scriptid_title_gender.txt', 
                         delimiter='\t', names=['Match', 'Movie', 'Code', 'Character', 'Gender'])
    labelsfile = labelsfile[labelsfile['Match']=='Matched']

    movie = labelsfile[labelsfile['Movie']==moviename]
    if movie is not None:
        movie = movie[['Character', 'Gender']]
        movie['Character'] = movie['Character']
        temp = list(df['Speaker'])
        gen = []
#         print(temp, movie['Character'])
        count = 0
        for speaker in temp:
            try:
                gender = movie[movie['Character'].str.contains(speaker)]
                gender = list(gender['Gender'])
                gen.append(gender[0])
            except:
                gen.append(np.nan)
        df['Gender'] = gen
    return df

In [137]:
entire_data = pd.DataFrame([['Speaker', ['d1']]], columns=['Speaker', 'Dialogues'])
files = os.listdir('../data/extracted')
files.sort()
pbar = ProgressBar()
for file in pbar(files):
    if file[-4:] != ".csv":
        continue
    else:
        data, moviename = getData(file)
        ans = convertToModelInputFormat(data)
        df = pd.DataFrame(ans, columns = ['Speaker', 'Dialogues'])
        df = getTrueLabels(df, moviename)
        df['Movie'] = moviename
        entire_data = entire_data.append(df, ignore_index=True)

entire_data = entire_data.drop(0)
entire_data.head()

100% |########################################################################|


Unnamed: 0,Speaker,Dialogues,Gender,Movie
1,patrick,"[I missed you., It was a bratwurst. I was eat...",m,10 Things I Hate About You
2,bogey,"[Nice to see you. Martini bar to the right, s...",,10 Things I Hate About You
3,michael,"[You the new guy?, C'mon. I'm supposed to giv...",m,10 Things I Hate About You
4,pepe,"[Some people like the Colombian, but it all de...",,10 Things I Hate About You
5,cameron,"[I don't think so, ma'am, So they tell me..., ...",m,10 Things I Hate About You


In [138]:
entire_data.shape

(42843, 4)

In [139]:
def getRightFormat(entire_data):
    new_data = entire_data[['Speaker', 'Dialogues', 'Gender']]
    dialogues = new_data['Dialogues'].tolist()
    
    new_d = [ ' '.join(dial) for dial in dialogues]
    
    new_data['Dialogues'] = new_d
    df = new_data.groupby('Gender')['Dialogues'].apply(' '.join).reset_index()    
    df = df.set_index('Gender')
    return df

In [140]:
wc_data = getRightFormat(entire_data)
# male = entire_data[entire_data['Gender'] == 'm']
# male.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [141]:
wc_data.head()
# female = entire_data[entire_data['Gender'] == 'f']
# female.shape

Unnamed: 0_level_0,Dialogues
Gender,Unnamed: 1_level_1
?,"Where'd you get all this Nazi stuff? Boss, Osw..."
f,Did you change your hair? You might wanna thin...
m,I missed you. It was a bratwurst. I was eatin...


In [142]:
def clean_data(phrase):
    
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"don\'t", "do not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"[0-9]", "", phrase)
    
    phrase=phrase.strip(string.punctuation)
    
    return phrase

In [143]:
wc_data["Dialogues"] = wc_data["Dialogues"].apply(clean_data)

In [144]:
wc_data.head()

Unnamed: 0_level_0,Dialogues
Gender,Unnamed: 1_level_1
?,"Where would you get all this Nazi stuff? Boss,..."
f,Did you change your hair? You might wanna thin...
m,I missed you. It was a bratwurst. I was eatin...


In [145]:
def diff(li1, li2):
    return (list(list(set(li1)-set(li2)) + list(set(li2)-set(li1))))
 
def updateStopWords(wc_data):
    wordss = list(set(nltk.corpus.words.words()))
    df2 = wc_data['Dialogues'].apply(' '.join).reset_index()
    df2 = ' '.join(list(df2))
    df2 = df2.lower().split(' ')
    stop = diff(df2, wordss)
    return stop

In [146]:
stop = updateStopWords(wc_data)
STOP_WORDS.update(stop)

In [147]:
# wordss = list(set(nltk.corpus.words.words()))

In [161]:
# if "rm" in wordss:
#     print("yes")
wc_data.head()

Unnamed: 0_level_0,Dialogues
Gender,Unnamed: 1_level_1
?,"Where would you get all this Nazi stuff? Boss,..."
f,Did you change your hair? You might wanna thin...
m,I missed you. It was a bratwurst. I was eatin...


In [176]:
def vectorise(clean_data):
    speakers = df.index.unique()
    corpus = [' '.join(clean_data.loc[(clean_data.index==candidate)]['Dialogues'].tolist()) for candidate in speakers]
    cv=CountVectorizer( stop_words=STOP_WORDS, ngram_range=(1, 3))
    X = cv.fit_transform(corpus)
    X = X.toarray()
    bow=pd.DataFrame(X, columns = cv.get_feature_names())
    bow.index=speakers
    return bow, speakers

In [177]:
bow, genders = vectorise(wc_data)

  'stop_words.' % sorted(inconsistent))


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
bow.head()

In [None]:
def getWordCloud(bow, col):
    text=bow.loc[col].sort_values(ascending=False)[:4000]
    text2_dict=bow.loc[col].sort_values(ascending=False).to_dict()
    wordcloud = WordCloud(min_word_length =3,
                          background_color='white')

    wordcloud.generate_from_frequencies(text2_dict)

    plt.figure(figsize=(8,4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
getWordCloud(bow, 'f')

In [None]:
getWordCloud(bow, 'm')

In [None]:
import math

In [None]:
def logOdds(speakers, bow):
    t_bow_df=pd.DataFrame()
    bow_transformed = bow.apply(lambda x: (x+1)/ (bow.loc[x.name].sum()+1), axis=1)
    pbar = ProgressBar()
    for candidate in pbar(speakers):
        feat_s = bow[bow.index != candidate].sum()+1 #sum all the times each word was said excluding the candidate
        feat_s_all = np.sum(bow[bow.index != candidate].sum()) # all the total words that were said, excluding the candidate
        tot = feat_s / feat_s_all #find the frequency of words said among other candidates
        row = bow_transformed.loc[candidate] / tot #divide the candidate's array of frequency of words by all other candidates' frequency
        row = pd.DataFrame(row).T #create a df, and transform
        results = row.apply(lambda x: math.log(x,2)) #take the log of each frequency

        t_bow_df = pd.concat([t_bow_df, pd.DataFrame([results], index=[candidate])]) #append to df
    return t_bow_df

In [None]:
bow_df = logOdds(['f', 'm'], bow)

In [None]:
bow_df.head()

In [None]:
bow_df.loc['f'].sort_values(ascending=False)[:10]

In [None]:
bow_df.loc['m'].sort_values(ascending=False)[:10]

In [None]:
def getCloud(bow_df, col):
    text = bow_df.loc[col].sort_values(ascending=False)[:4000]
    text3_dict={k: v for k, v in sorted(text.items(),reverse=True, key=lambda item: item[1])}
    wordcloud = WordCloud(min_word_length =3,
                      background_color='white').generate_from_frequencies(text3_dict)

    plt.figure(figsize=(8,4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
getCloud(bow_df, 'f')

In [None]:
getCloud(bow_df, 'm')