In [135]:
import pandas as pd
import os
from progressbar import ProgressBar
import numpy as np

In [136]:
def getData(file):
    """Load one extracted script CSV and derive the movie title from its name.

    Parameters
    ----------
    file : str
        File name (not a full path) located under ``../data/extracted/``.

    Returns
    -------
    tuple
        ``(data, moviename)`` where ``data`` is the parsed DataFrame and
        ``moviename`` is ``file`` without its last four characters and with
        every underscore replaced by a space.
    """
    csv_path = "../data/extracted/" + file
    data = pd.read_csv(csv_path, sep=',')
    # Strip the 4-character extension, then turn underscores into spaces.
    stem = file[:-4]
    moviename = stem.replace('_', ' ')
    return data, moviename

In [137]:
def convertToModelInputFormat(data):
    """Group a script's dialogue lines by speaker.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Speaker`` and a ``Dialogue`` column, one row per
        line of dialogue.

    Returns
    -------
    list
        One ``[speaker, dialogues]`` pair per distinct speaker, where
        ``dialogues`` is the list of that speaker's ``Dialogue`` values in
        row order. Speakers appear in order of first occurrence, so the
        result is the same on every run. Rows whose ``Speaker`` is missing
        are skipped.
    """
    # sort=False keeps first-appearance order (deterministic, unlike set
    # iteration) and one groupby pass avoids re-filtering the whole frame
    # once per speaker.
    grouped = data.groupby('Speaker', sort=False)['Dialogue']
    return [[speaker, list(dialogues)] for speaker, dialogues in grouped]

In [138]:
def getTrueLabels(df, moviename,
                  labels_path='../data/Pre-processing_files/polygraph_matched_scriptid_title_gender.txt'):
    """Attach a ``Gender`` column to ``df`` using the character label file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Speaker`` column. It is modified in place (a
        ``Gender`` column is added or overwritten) and also returned.
    moviename : str
        Title compared for exact equality with the ``Movie`` column of the
        label file.
    labels_path : str, optional
        Tab-separated, header-less label file with the columns
        ``Match, Movie, Code, Character, Gender``. Defaults to the project's
        label file.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. For each speaker, ``Gender`` is the gender of
        the first ``Matched`` character of this movie whose name contains the
        speaker string (case-sensitive, literal substring), or NaN when there
        is no such character.

    Notes
    -----
    The label file is read on every call.
    """
    labelsfile = pd.read_csv(labels_path, delimiter='\t',
                             names=['Match', 'Movie', 'Code', 'Character', 'Gender'])
    matched = labelsfile[labelsfile['Match'] == 'Matched']
    movie = matched.loc[matched['Movie'] == moviename, ['Character', 'Gender']]
    characters = movie['Character']

    genders = []
    for speaker in df['Speaker']:
        gender = np.nan
        # Non-string speakers (e.g. NaN) and movies without labels cannot match.
        if len(characters) and isinstance(speaker, str):
            # regex=False: speaker names are plain text, not patterns.
            # na=False: a missing character name must not poison the mask.
            hits = movie.loc[
                characters.str.contains(speaker, regex=False, na=False), 'Gender'
            ]
            if len(hits):
                gender = hits.iloc[0]
        genders.append(gender)

    df['Gender'] = genders
    return df

In [139]:
def test_train_split(entire_data):
    data = entire_data.copy()
    train_set = data.sample(frac=0.70, random_state=0)
    test_set = data.drop(train_set.index)
    train_set_labels = train_set.pop('Gender')
    test_set_labels = test_set.pop('Gender')
    

In [140]:
# Files this notebook itself writes into the input directory. They are not
# per-movie scripts (no 'Dialogue' column), so they are skipped to keep the
# notebook re-runnable.
generated_files = {'entire_data.csv', 'entire_data_nan_values.csv'}

files = os.listdir('../data/extracted')
files.sort()
pbar = ProgressBar()
movie_frames = []
for file in pbar(files):
    if file[-4:] != ".csv" or file in generated_files:
        continue
    data, moviename = getData(file)
    ans = convertToModelInputFormat(data)
    df = pd.DataFrame(ans, columns=['Speaker', 'Dialogues'])
    df = getTrueLabels(df, moviename)
    df['Movie'] = moviename
    movie_frames.append(df)

# Concatenate once at the end instead of growing the frame inside the loop.
if movie_frames:
    entire_data = pd.concat(movie_frames, ignore_index=True)
else:
    entire_data = pd.DataFrame(columns=['Speaker', 'Dialogues', 'Gender', 'Movie'])
# Row labels start at 1, matching the labels shown in the recorded outputs.
entire_data.index = pd.RangeIndex(1, len(entire_data) + 1)
entire_data

100% |########################################################################|


Unnamed: 0,Speaker,Dialogues,Gender,Movie
1,girl,[Awesome shoes.],,10 Things I Hate About You
2,cameron,"[I don't think so, ma'am, So they tell me..., ...",m,10 Things I Hate About You
3,cowboy,"[And that's for the fourth grade, asshole.]",,10 Things I Hate About You
4,mandella,"[Uh, yeah, I read it all, Haven't you?, Who's ...",f,10 Things I Hate About You
5,patrick,"[I missed you., It was a bratwurst. I was eat...",m,10 Things I Hate About You
...,...,...,...,...
42839,ivan 1,[It's sweet.],,xXx
42840,shavers,[Hate those Russkie choppers. Rattle- trap pie...,,xXx
42841,gibbons,"[Evening, Sam., Not a whole helluva lot. His f...",,xXx
42842,jefe,[Do you know what we do with people come round...,m,xXx


In [143]:
# Save the combined per-speaker table; index=False leaves the row labels out
# of the file.
# NOTE(review): the target is the same directory that the loading cell lists
# for '.csv' inputs.
entire_data.to_csv('../data/extracted/entire_data.csv', index=False)

In [2]:
# entire_data['Dialogues'].to_list()[1][0]

In [141]:
# Boolean mask (one entry per row) that is True where no gender label was found.
null_url = entire_data['Gender'].isnull()
# Keep only the unlabelled speakers.
temp = entire_data.loc[null_url]

In [142]:
# Display the rows of entire_data whose Gender is missing.
temp

Unnamed: 0,Speaker,Dialogues,Gender,Movie
1,girl,[Awesome shoes.],,10 Things I Hate About You
3,cowboy,"[And that's for the fourth grade, asshole.]",,10 Things I Hate About You
6,sharon,"[In the microwave., What's a synonym for throb...",,10 Things I Hate About You
9,pepe,"[Some people like the Colombian, but it all de...",,10 Things I Hate About You
10,bruce,"[Take it easy on the guys in there., Next time...",,10 Things I Hate About You
...,...,...,...,...
42838,nerdy agent,[This is your communicator. You'll identify yo...,,xXx
42839,ivan 1,[It's sweet.],,xXx
42840,shavers,[Hate those Russkie choppers. Rattle- trap pie...,,xXx
42841,gibbons,"[Evening, Sam., Not a whole helluva lot. His f...",,xXx


In [144]:
# Save the speakers without a gender label; index=False leaves the row labels
# out of the file.
# NOTE(review): the target is the same directory that the loading cell lists
# for '.csv' inputs.
temp.to_csv('../data/extracted/entire_data_nan_values.csv', index=False)