In [None]:
import pandas as pd
import sys
import networkx as nx
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
text_col = 'Excerpt Copy'

# Columns retained for the rest of the notebook, in the order they should
# appear in `df`.
keep_cols = [
    'uni', 'Participant', 'Excerpt Copy', 'rank', 'identity',
    'Q3-g', 'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer',
    'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-f', 'Q4-m',
]

# Read the tab-separated file and immediately narrow it to those columns.
df = pd.read_csv('data/sensitive/coder1_all.tsv', sep='\t')[keep_cols]

# Number of rows loaded.
print(len(df))

df.head()

In [None]:
# Remove the "Question: Q<n>[x]; Answer:" and "Question: Q<n>-other; Answer:"
# prefixes wherever they occur in the frame. The patterns are raw strings so
# that `\d` and `\w` reach the regex engine as written instead of being parsed
# as (invalid) string escapes, which newer Python versions warn about.
df = df.replace({r'Question: Q\d*\w?; Answer:': ''}, regex=True)
df = df.replace({r'Question: Q\d*-other; Answer:': ''}, regex=True)

# Build one row per (uni, Participant) pair: all of that pair's excerpts are
# joined with single spaces and wrapped in literal braces.
text = df[['uni', 'Participant', 'Excerpt Copy']].groupby(['uni', 'Participant'])
text = text.agg(lambda t: "{%s}" % ' '.join(t))

# Number of (uni, Participant) groups.
print(text.shape[0])
text.head()

In [None]:
# Shared English Snowball stemmer used by normalize() below.
stemmer = nltk.stem.snowball.SnowballStemmer("english")

def normalize(text):
    '''Lowercase, tokenize and stem a string.

    The input is lowercased and split into runs of word characters (regex
    ``\\w+``), which discards punctuation and whitespace. Each token is then
    reduced to its stem with the module-level English Snowball ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if ``text`` contains
        no word characters.
    '''
    clean = text.lower()
    # Raw string: `\w` must reach the regex engine untouched rather than be
    # interpreted as an (invalid) string escape sequence.
    tokens = nltk.tokenize.regexp_tokenize(clean, r'\w+')
    return [stemmer.stem(item) for item in tokens]

# Move the ('uni', 'Participant') group keys out of the index and back into
# ordinary columns. The guard keeps this cell idempotent: an unconditional
# reset_index would insert an extra 'index' column every time it is re-run.
if 'Participant' not in text.columns:
    text = text.reset_index()

text.head()

In [None]:
# TF-IDF vectorizer that delegates tokenization to normalize(). token_pattern
# is set to None explicitly because it is ignored whenever a custom tokenizer
# is supplied; leaving the default in place makes scikit-learn emit a
# "token_pattern will not be used" warning on fit.
vectorizor = TfidfVectorizer(tokenizer=normalize, token_pattern=None)

In [None]:
# One document per row of `text`: the concatenated excerpts.
sample = text['Excerpt Copy'].values
tfidf = vectorizor.fit_transform(sample)

# Pairwise dot products between the TF-IDF rows, as a dense array. With the
# vectorizer's default L2 row normalization these are cosine similarities.
# `@` is used so this is a matrix product for both sparse matrix and sparse
# array types, and `.toarray()` replaces the `.A` shorthand, which newer SciPy
# releases no longer provide.
sims = (tfidf @ tfidf.T).toarray()
sims

In [None]:
n = len(sims)

# Walk the strict upper triangle of the similarity matrix so each unordered
# pair (i, j) with i < j yields exactly one record; self-pairs are skipped.
pair_rows = []
for i in range(n):
    for j in range(i + 1, n):
        pair_rows.append((i, j, sims[i][j]))

# Edge list: positional row indices i and j plus their similarity.
edges = pd.DataFrame(pair_rows, columns=['i', 'j', 'cosine_sim'])
edges.head()

In [None]:
# Histogram of the pairwise similarity values, in 20 bins.
edges['cosine_sim'].hist(bins=20)

In [None]:
# Write the pairwise similarity table as TSV (the DataFrame index is included).
edges.to_csv('data/public/cosine_people.tsv', sep='\t')

# Narrow `text` to the 'uni' and 'Participant' columns before writing, so the
# excerpt text is not part of the second file. Note that this rebinds `text`:
# the 'Excerpt Copy' column is no longer available after this cell runs.
text = text.loc[:, ['uni', 'Participant']]
text.to_csv('data/public/cosine_people_ids.tsv', sep='\t')

In [None]:
# Preview the first five rows of `text` as it stands at this point.
text.head(n=5)