In [12]:
import pandas as pd
from utils.preprocess import preprocess_IMDB_sentence
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
import numpy as np
from convokit import Corpus, download, FightingWords, Utterance, Speaker
from collections import Counter
from torchtext.vocab import GloVe
from utils.corpus import get_corpus

[nltk_data] Downloading package punkt to /Users/janek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
def corpus_from_source(source, field, n):
    df, _  = get_corpus(source, text_field=field, split_sentences=True, punct=False, to_ascii=True,
                min_len=3, max_len=15,  test_size=0.0001, subsample_rows=n, save=False)
    speaker = Speaker(meta={'name': source})
    utts = [Utterance(id=f'{source}i', text=' '.join(t), speaker=speaker) for i, t in enumerate(df.utterance) ]
    corp = Corpus(utterances=utts)
    return df, corp

def compare_corpora(c1, c2, n1, n2):
    print(n1)
    c1.print_summary_stats()
    print(c1.get_utterances_dataframe().text.map(lambda x: x.split()).map(len).describe())
    print('\n'.join(c1.random_utterance().text for _ in range(10)))
    print('\n')
    print(n2)
    c2.print_summary_stats()
    print(c2.get_utterances_dataframe().text.map(lambda x: x.split()).map(len).describe())
    print('\n'.join(c2.random_utterance().text for _ in range(10)))
    ids1 = set(c1.get_utterance_ids())
    corp = c1.merge(c2)
    fw = FightingWords(ngram_range=(1,1))
    fw.fit(corp, class1_func=lambda utt: utt.id in ids1, 
                 class2_func=lambda utt: utt.id not in ids1)
    print(fw.summarize(corp, plot=False, class1_name=n1,
                                          class2_name=n2))
    return fw
    
def run_comparison(source1, field1, source2, field2, n=1000):
    print('loading ', source1)
    df1, c1 = corpus_from_source(source1, field1, n)
    print('loading ', source2)
    df2, c2 = corpus_from_source(source2, field2, n)
    fw = compare_corpora(c1, c2, source1, source2)

def word_freqs(data, voc, n=1000):
    return Counter(w if w in voc else 'unk' for s in data.iloc[:n] for w in s)

In [36]:
# imdb = pd.read_csv('data_options/IMDB Dataset.csv')
# food = pd.read_csv('data_options/Reviews.csv')
# movies = pd.read_csv('data_options/movies_metadata.csv')
# tmbd = pd.read_csv('data_options/tmdb_5000_movies.csv')
# hotels = pd.read_csv('data_options/tripadvisor_hotel_reviews.csv')

In [37]:
# with open('data_options/google-10000-english-usa.txt', 'r') as fh:
#     vocab1 = set(t.strip() for t in fh.readlines())
# g = GloVe('6B',dim=50)
# v = set(g.stoi)

In [39]:
df, c = corpus_from_source('friends-corpus', 'text', n=100)

Downloading dataset...
Dataset already exists at /Users/janek/.convokit/downloads/friends-corpus
Cleaning
108
1


In [None]:
print('\n'.join(c.random_utterance().text for _ in range(10)))

In [38]:
run_comparison('friends-corpus', 'text', 'IMDB Dataset.csv', 'review', n=100)

loading  friends-corpus
Downloading dataset...
Dataset already exists at /Users/janek/.convokit/downloads/friends-corpus
Cleaning
108
1
loading  IMDB Dataset.csv
Loading dataset from csv...
Cleaning
324
1
friends-corpus
Number of Speakers: 1
Number of Utterances: 1
Number of Conversations: 1
count    1.0
mean     4.0
std      NaN
min      4.0
25%      4.0
50%      4.0
75%      4.0
max      4.0
Name: text, dtype: float64
okay it 's um
okay it 's um
okay it 's um
okay it 's um
okay it 's um
okay it 's um
okay it 's um
okay it 's um
okay it 's um
okay it 's um


IMDB Dataset.csv
Number of Speakers: 1
Number of Utterances: 1
Number of Conversations: 1
count     1.0
mean     13.0
std       NaN
min      13.0
25%      13.0
50%      13.0
75%      13.0
max      13.0
Name: text, dtype: float64
i saw this stage show when it was broadcast on pbs in 1983
i saw this stage show when it was broadcast on pbs in 1983
i saw this stage show when it was broadcast on pbs in 1983
i saw this stage show when i

ValueError: max_df corresponds to < documents than min_df

In [4]:
friends = Corpus(filename=download("friends-corpus"))
# supreme = Corpus(filename=download("supreme-corpus"))
# # parliament = Corpus(filename=download("parliament-corpus"))
# movie = Corpus(filename=download("movie-corpus"))
# diplomacy = Corpus(filename=download("diplomacy-corpus")) # game, written
# switchboard = Corpus(filename=download("switchboard-corpus")) # conversational, from spoken
# reddit = Corpus(filename=download('reddit-corpus-small'))
# tennis = Corpus(filename=download('tennis-corpus'))

Dataset already exists at /Users/janek/.convokit/downloads/friends-corpus


Dataset already exists at /Users/janek/.convokit/downloads/friends-corpus
Dataset already exists at /Users/janek/.convokit/downloads/switchboard-corpus
friends-corpus
Number of Speakers: 700
Number of Utterances: 67373
Number of Conversations: 3107
count    67373.000000
mean         9.245454
std         10.396307
min          0.000000
25%          2.000000
50%          6.000000
75%         13.000000
max        196.000000
Name: text, dtype: float64
...now remember you have to imagine me in a kilt.
No, no, no.
Oh no, no, no, no, no, no. Don't get me wrong. No, he's not in like a sissy way. No, no, no... when he gets going, he can rattle a headboard like a sailor on leave...
They won't be ready for weeks.
I'm not supposed to tell you.
I can't believe Ross went out with Rachel's sister! When Chandler made out with my sister I was mad at him for 10 years.
Ya know, the man's got a point.
Oh Ross!!
This is a girl that I really like and had too swoop in there!
Phoebe?


switchboard-corpus
Numb

Unnamed: 0_level_0,z-score,class
ngram,Unnamed: 1_level_1,Unnamed: 2_level_1
uh,-103.397367,supreme court
and,-71.331281,supreme court
they,-62.526009,supreme court
huh,-61.455047,supreme court
of,-50.703762,supreme court
...,...,...
this,57.471653,friends
okay,59.311416,friends
you,66.945929,friends
me,67.194980,friends
