In [239]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import sklearn.metrics as m
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
import itertools
import seaborn as sn
from textblob import TextBlob
%matplotlib inline
import nltk
from nltk.stem import *
import re 
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans as KMeans

In [109]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/swethapola/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [140]:
def preprocessor(data):
    
    # TfidfVectorizer puts all letters in lowercase by default
    data = data.lower()

    # sub hyphens with no space
    data = re.sub("\-", '', data)
    
    # sub non letter & non digit characters w/ a space
    data = re.sub("[^A-Za-z0-9 \\n]","", data)
    
    # sub digit characters with '#' character
    #data = re.sub("[\d]", "\#", data) 
    
    #removing stop words
    word_tokens = word_tokenize(data)
    sw = stopwords.words("english")
    filtered_sentence = []
    for w in word_tokens: 
        if w not in sw: 
            filtered_sentence.append(w) 
    
    #turn list of tokens back into str
    data = ' '.join(filtered_sentence)
    
    # shorten long words of length 20 or more
    data = re.sub('(\w{20})\w+', '\\1', data) #mainly gets rid of hyperlinks
    
    
    # stemming words
    #stem = PorterStemmer()
    #for word in data:
     #   data = data.replace(word, stem.stem(word))
    
    return data

In [145]:
def tvect(data):
    vect = TfidfVectorizer(min_df = 2, ngram_range = (2,4))
    fitted = vect.fit_transform(data)
    return fitted, vect.get_feature_names()

def get_polarity(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [146]:
def featurize(csv, path):
    #read in csv & make a df
    df = pd.read_csv(path + csv)
    df = df.drop("Unnamed: 0", axis = 1)


    df = df.rename(columns={'0': 'text'})
    
    #add polarity column
    df['polarity'] = df['text'].apply(get_polarity)
    
    #add preprocessed column
    df['preprocess_full_text'] = df.text.apply(lambda x: preprocessor(x))
    
    #add feature names from TfidfVectorizer
    featurized = tvect(df.preprocess_full_text)
    df1 = pd.DataFrame(featurized[0].toarray(), columns=featurized[1])
    df = pd.concat([df, df1], axis=1)
    
    
    return df, featurized[1]


In [150]:
w = featurize('AskWomen_Data.csv' , 'data/')
w_df = w[0]
w_features = w[1]

In [152]:
m = featurize('AskMen_Data.csv' , 'data/')
m_df = m[0]
m_features = m[1]

In [202]:
w_df["outcome"] = 1
m_df["outcome"] = 0

In [209]:
df_nn = pd.concat([w_df[["text", "outcome"]], m_df[["text", "outcome"]]])
df_nn.to_csv("data/NN_Data.csv")

# PCA AND CLUSTERING

In [156]:
intersection = set(w_features).intersection(set(m_features))

len(intersection)


1776

In [157]:
len(w_features)

20606

In [158]:
len(m_features)

15958

In [None]:
intersection

In [190]:
# build the full df for clustering
full_df = pd.concat([w_df, m_df])
df = full_df.drop(columns = ['text', 'preprocess_full_text'])

In [191]:
df = df.fillna(0)

In [192]:
# create an instance of the PCA class
pca = PCA(n_components=10)

# fit the data using the original X_train_std data; tranform X_train_std data
pca_df = pca.fit_transform(df)

# transform the X_test_std data
#X_test_pca_skl = pca.transform(X_test_std)

In [189]:
kmeans = KMeans(n_clusters=2, init='k-means++')

kmeans.fit(pca_df)

kmeans.labels_




array([1, 1, 1, ..., 1, 0, 0], dtype=int32)

In [193]:
full_df['outcome'] = kmeans.labels_

# LOG REG

In [221]:
full_df.head()

Unnamed: 0,text,polarity,preprocess_full_text,10 hours,10 minutes,10 minutes without,10 minutes without rest,10 yawned,10 yawned walked,10 yawned walked slowly,...,youve already rubs leg,youve made,youve made right,youve made right decision,youve meaning,youve meaning activity,youve meaning activity occupy,yr old,yrs old,outcome
0,My dad had a heart attack and spent over 7 min...,0.038258,dad heart attack spent 7 minutes without oxyge...,0.317523,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,1
1,I had dangerous open heart surgery due to mult...,-0.072619,dangerous open heart surgery due multiple cong...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,1
2,"In June of 2020, my Dad collapsed in our yard ...",0.054932,june 2020 dad collapsed yard helping us mow la...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,1
3,"after my twin aunts were born, my grandma beca...",0.145833,twin aunts born grandma became severely ill cl...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0
4,Not me but my mother is the most severe case I...,-0.192593,mother severe case think around 25 started reg...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,1


In [281]:
full_df = full_df.fillna(0)
X = full_df.drop(["outcome", "text", "preprocess_full_text", "polarity"], axis = 1)
y = full_df["outcome"]

In [302]:
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(pca_df), y, test_size=0.3, random_state=1, stratify=y)

In [303]:
model = linear_model.LogisticRegressionCV(max_iter = 200)

In [304]:
model.fit(X_train, y_train, sample_weight=None)

LogisticRegressionCV(max_iter=200)

In [305]:
y_pred = model.predict(X_test)

In [306]:
print('Prediction accuracy: %3.2f' % model.score(X_test, y_test))

Prediction accuracy: 1.00


In [307]:
model.coef_[0]

array([107.88069127,   3.146667  ,   2.32769269,  -0.20959839,
        -0.38610626,   0.14997508,   2.65952681,  -0.26670778,
        -3.46006762,  -0.25306952])

In [308]:
pd.DataFrame(pca_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.058245,0.124470,0.004593,0.004606,-0.001428,-0.002925,-0.013000,-0.007425,-0.008437,0.002804
1,0.172219,-0.023360,-0.016351,-0.017430,-0.012399,-0.017354,-0.104065,0.581251,0.013932,0.226547
2,0.042163,0.069368,0.006113,-0.002261,-0.005021,-0.005639,0.008598,-0.001455,0.000196,-0.005946
3,-0.057476,-0.034927,0.110671,0.039755,0.006314,0.000046,0.004069,-0.004665,-0.037008,0.001673
4,0.290915,0.009349,-0.011545,-0.005327,-0.002774,-0.004974,-0.017203,-0.010485,-0.005979,0.003551
...,...,...,...,...,...,...,...,...,...,...
2783,0.130871,-0.020700,-0.014323,-0.011312,-0.007607,-0.007407,0.004235,-0.006996,-0.001460,0.006745
2784,0.047404,-0.026362,0.019660,-0.003790,-0.007309,-0.013120,-0.033032,0.063154,0.092240,-0.043991
2785,0.145857,-0.031994,0.022421,-0.002565,-0.010083,-0.007889,-0.078460,0.044380,0.258573,-0.047233
2786,-0.029578,-0.014583,-0.019501,-0.013553,0.005396,-0.005396,-0.003238,-0.003669,-0.011232,-0.003884


In [309]:
coef_table = pd.DataFrame(pca_df)
coef_table.insert(len(coef_table.columns),"Coefs", model.coef_.transpose())

ValueError: Length of values (10) does not match length of index (2788)

NameError: name 'tf' is not defined

In [276]:
coef_table.sort_values("Coefs")


Unnamed: 0,0,Coefs
0,polarity,-40.884744
28465,means something,-2.235309
13054,one time,-2.218298
25925,guys feel,-2.211709
8894,hes ever,-2.206047
...,...,...
8332,heart attack,1.694961
5292,dont anything,1.722159
3464,chest pain,1.747430
27905,long go,1.955540
