In [12]:
import pandas as pd
from utils.preprocess import preprocess_IMDB_sentence
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
import numpy as np
from convokit import Corpus, download, FightingWords, Utterance, Speaker
from collections import Counter
from torchtext.vocab import GloVe
from utils.corpus import get_corpus

[nltk_data] Downloading package punkt to /Users/janek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
def corpus_from_source(source, field, n):
    """Load sentences from ``source`` and wrap them in a single-speaker convokit Corpus.

    Args:
        source: Dataset identifier handed to ``get_corpus``; it is also stored as
            the speaker's ``name`` metadata and used as the prefix of every
            utterance id (``f'{source}{i}'``).
        field: Name of the text field, forwarded as ``text_field``.
        n: Forwarded to ``get_corpus`` as ``subsample_rows``.

    Returns:
        A ``(df, corp)`` pair: the first value returned by ``get_corpus`` and a
        ``Corpus`` containing one ``Utterance`` per entry of ``df.utterance``.
    """
    loader_options = dict(
        text_field=field,
        split_sentences=True,
        punct=False,
        to_ascii=True,
        min_len=3,
        max_len=15,
        test_size=0.0001,
        subsample_rows=n,
        save=False,
    )
    # The second return value is not needed here.
    # NOTE(review): presumably the held-out split implied by test_size - confirm.
    df, _ = get_corpus(source, **loader_options)

    # All utterances are attributed to one shared speaker named after the source.
    speaker = Speaker(meta={'name': source})
    utts = []
    for idx, tokens in enumerate(df.utterance):
        # Each entry is an iterable of strings; join it back into one text line.
        utts.append(Utterance(id=f'{source}{idx}', text=' '.join(tokens), speaker=speaker))

    return df, Corpus(utterances=utts)

def _describe_corpus(corpus, name, n_samples=10):
    """Print an overview of one corpus: its name, summary stats, length stats and samples."""
    print(name)
    corpus.print_summary_stats()
    # Number of whitespace-separated tokens in each utterance's text.
    lengths = corpus.get_utterances_dataframe().text.map(lambda x: x.split()).map(len)
    print(lengths.describe())
    print('\n'.join(corpus.random_utterance().text for _ in range(n_samples)))


def compare_corpora(c1, c2, n1, n2):
    """Print an overview of two corpora and fit a FightingWords comparison on them.

    Args:
        c1: First corpus; its utterances form class 1.
        c2: Second corpus; every utterance of the merged corpus whose id is not
            in ``c1`` forms class 2.
        n1: Display name for ``c1`` (printed and used as ``class1_name``).
        n2: Display name for ``c2`` (printed and used as ``class2_name``).

    Returns:
        The fitted ``FightingWords`` object (unigrams only).

    Note:
        Class membership is decided by utterance id, so the two corpora are
        expected to use disjoint ids.
    """
    _describe_corpus(c1, n1)
    print('\n')
    _describe_corpus(c2, n2)

    # Remember which ids came from c1 before merging, so the class functions
    # can tell the two sources apart in the combined corpus.
    ids1 = set(c1.get_utterance_ids())
    corp = c1.merge(c2)

    fw = FightingWords(ngram_range=(1, 1))
    fw.fit(corp,
           class1_func=lambda utt: utt.id in ids1,
           class2_func=lambda utt: utt.id not in ids1)
    print(fw.summarize(corp, plot=False, class1_name=n1, class2_name=n2))
    return fw
    
def run_comparison(source1, field1, source2, field2, n=1000):
    """Load two sources as corpora and compare them with ``compare_corpora``.

    Args:
        source1: First dataset identifier (also used as its display name).
        field1: Text field to read from ``source1``.
        source2: Second dataset identifier (also used as its display name).
        field2: Text field to read from ``source2``.
        n: Passed to ``corpus_from_source`` for both sources.

    Returns:
        Whatever ``compare_corpora`` returns (the fitted comparison object), so
        the caller can inspect it further. In a notebook, end the call with
        ``;`` if the returned object should not be echoed.
    """
    print('loading ', source1)
    # Only the corpus is needed; the accompanying DataFrame is discarded.
    _, c1 = corpus_from_source(source1, field1, n)
    print('loading ', source2)
    _, c2 = corpus_from_source(source2, field2, n)
    return compare_corpora(c1, c2, source1, source2)

def word_freqs(data, voc, n=1000, unk_token='unk'):
    """Count token frequencies over the first ``n`` rows, pooling unknown tokens.

    Args:
        data: Object supporting ``.iloc[:n]`` slicing (e.g. a pandas Series)
            whose rows are iterables of tokens.
        voc: Container of known tokens; membership is tested with ``in``, so a
            ``set`` keeps the lookup cheap.
        n: Number of leading rows to count (``None`` counts every row).
        unk_token: Label under which all tokens missing from ``voc`` are
            counted. Defaults to ``'unk'``.

    Returns:
        A ``collections.Counter`` mapping each in-vocabulary token (and
        ``unk_token``) to its number of occurrences.

    Note:
        If ``unk_token`` itself is a member of ``voc``, its genuine occurrences
        are merged with the out-of-vocabulary count.
    """
    counts = Counter()
    for sentence in data.iloc[:n]:
        counts.update(w if w in voc else unk_token for w in sentence)
    return counts

In [36]:
# imdb = pd.read_csv('data_options/IMDB Dataset.csv')
# food = pd.read_csv('data_options/Reviews.csv')
# movies = pd.read_csv('data_options/movies_metadata.csv')
# tmbd = pd.read_csv('data_options/tmdb_5000_movies.csv')
# hotels = pd.read_csv('data_options/tripadvisor_hotel_reviews.csv')

In [37]:
# with open('data_options/google-10000-english-usa.txt', 'r') as fh:
#     vocab1 = set(t.strip() for t in fh.readlines())
# g = GloVe('6B',dim=50)
# v = set(g.stoi)

In [56]:
# Contrast the Friends corpus with the IMDB review CSV; n is handed on to the
# loader as the subsample size for each source.
run_comparison(
    source1='friends-corpus', field1='text',
    source2='IMDB Dataset.csv', field2='review',
    n=10000,
)

loading  friends-corpus
Downloading dataset...
Dataset already exists at /Users/janek/.convokit/downloads/friends-corpus
Cleaning
10024
2
loading  IMDB Dataset.csv
Loading dataset from csv...
Cleaning
32757
4
friends-corpus
Number of Speakers: 1
Number of Utterances: 10024
Number of Conversations: 1
count    10024.000000
mean         6.928372
std          3.260859
min          3.000000
25%          4.000000
50%          6.000000
75%          9.000000
max         15.000000
Name: text, dtype: float64
oh so did i
oh will you do the top of the cabinets
hey rach did you make your money
do you know how hard it is to meet a guy like that
you took your eggs and you left
how about a little of that smoked turkey
i get it
oh no no never say that
do n't be
you are going to make a joke about my special present


IMDB Dataset.csv
Number of Speakers: 1
Number of Utterances: 32757
Number of Conversations: 1
count    32757.000000
mean         9.623165
std          3.527539
min          3.000000
25%    

In [4]:
# Load the Friends corpus directly through convokit. As the cell output shows,
# download() reports reusing the local copy when the dataset is already present.
# NOTE(review): `friends` is not referenced by any other cell visible here, and this
# cell's execution count is lower than the cells above it - confirm it is still needed.
friends = Corpus(filename=download("friends-corpus"))
# Other convokit corpora that could be loaded the same way (currently disabled):
# supreme = Corpus(filename=download("supreme-corpus"))
# # parliament = Corpus(filename=download("parliament-corpus"))
# movie = Corpus(filename=download("movie-corpus"))
# diplomacy = Corpus(filename=download("diplomacy-corpus")) # game, written
# switchboard = Corpus(filename=download("switchboard-corpus")) # conversational, from spoken
# reddit = Corpus(filename=download('reddit-corpus-small'))
# tennis = Corpus(filename=download('tennis-corpus'))

Dataset already exists at /Users/janek/.convokit/downloads/friends-corpus
