In [None]:
import pandas as pd
import numpy as np

import datetime

# import json
import re
import string

import collections

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


from nltk.stem.porter import PorterStemmer

import os
import re
from collections import Counter

# import en_core_web_sm

import math
from numpy.linalg import norm
import copy
from scipy.optimize import minimize

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string

from itertools import chain

In [None]:
def computeTFIDFVector(dct_igm, review):
    """Project one review's weights onto the sorted vocabulary.

    Args:
        dct_igm: mapping whose keys define the vocabulary (its values are
            not read here).
        review: mapping from word to that word's weight for one review.

    Returns:
        A list with one entry per key of ``dct_igm``, in sorted key order:
        the review's weight for that word, or 0.0 when the word is absent.
    """
    vocabulary = sorted(dct_igm.keys())
    return [review[term] if term in review else 0.0 for term in vocabulary]

def computeCountDict(list_tf):
    """Count in how many tf dictionaries each word occurs.

    Args:
        list_tf: iterable of per-review tf dictionaries (iterating one
            yields its words).

    Returns:
        A plain dict mapping every word seen to the number of reviews whose
        tf dictionary contains it, in first-seen order.
    """
    # Chaining the dictionaries yields each review's keys once per review.
    document_frequency = Counter(chain.from_iterable(list_tf))
    return dict(document_frequency)

def computeReviewTFDict(review):
    """Compute the term frequency of every word in one review.

    Args:
        review: sequence of tokens.

    Returns:
        A dict mapping each distinct token to (occurrences / len(review)).
        An empty review yields an empty dict.
    """
    occurrences = Counter(review)
    n_tokens = len(review)
    return {word: count / n_tokens for word, count in occurrences.items()}

def map_dict(dct_igm, review):
    """Replace each key's value with its looked-up weight, scaled by log(size).

    Only the keys of ``review`` are used; its values are ignored.

    Args:
        dct_igm: mapping from word to weight.
        review: mapping whose keys are the words of one user/review.

    Returns:
        A dict with the same keys as ``review``. Each value is the weight
        found in ``dct_igm`` (0 when the key is missing) divided by
        ``np.log(len(review))``.

    NOTE(review): with exactly one key the divisor is log(1) == 0, so the
    result is inf or NaN (NumPy emits a RuntimeWarning). Callers have to
    filter such rows themselves.
    """
    words = list(review.keys())
    n_words = len(words)
    scaled = []
    for word in words:
        weight = dct_igm.get(word)
        if weight is None:
            weight = 0
        scaled.append(weight / np.log(n_words))
    return dict(zip(words, scaled))

def adj_noun_merger(doc):
    """Merge adjective/noun + noun runs of ``doc`` into single NOUN tokens, in place.

    Wherever the token at ``offset`` is tagged ADJ or NOUN and the next token
    is tagged NOUN, the span starting at ``offset`` is merged into one token
    whose POS attribute is set to "NOUN". The span covers two tokens, or three
    when token ``offset+2`` is also a NOUN, or four when ``offset+3`` is too.

    ``doc`` is modified through ``doc.retokenize()``; nothing is returned.
    NOTE(review): presumably ``doc`` is a spaCy ``Doc`` (uses ``.pos_``, ``.i``
    and ``retokenize``) -- confirm.
    """
    offset = 0
    # The bound keeps the offset+3 look-ahead in range; as a consequence the
    # last three tokens never start a merge. len(doc) is re-read on every
    # pass, so it reflects tokens removed by earlier merges.
    while offset < len(doc) - 3:
        if doc[offset].pos_ in ["ADJ", "NOUN"] and doc[offset+1].pos_ == "NOUN":
            start = doc[offset].i
            # Extend the span over up to two more consecutive nouns.
            if doc[offset+2].pos_ == 'NOUN':
                if doc[offset+3].pos_ == 'NOUN': end = doc[offset+3].i
                else: end = doc[offset+2].i 
            else: end = doc[offset+1].i
            with doc.retokenize() as retokenizer:
                retokenizer.merge(doc[start:end+1], attrs={"POS" : "NOUN"}) 
        offset += 1
        
def get_counters(left, right):
    """Build count dictionaries for two token lists over their joint vocabulary.

    Args:
        left: list of tokens.
        right: list of tokens.

    Returns:
        ``(left_dict, right_dict)``. Both contain every token that occurs in
        either list. A token's value is its number of occurrences in that
        list, or 1 when it does not occur there at all.
    """
    from collections import Counter

    # Placeholder count of 1 for every token of the joint vocabulary.
    placeholder_counts = dict.fromkeys(set(left + right), 1)

    left_dict = dict(placeholder_counts)
    right_dict = dict(placeholder_counts)

    # Real counts overwrite the placeholder wherever the token is present.
    left_dict.update(Counter(left))
    right_dict.update(Counter(right))
    return left_dict, right_dict

def topK(beta,vocab,K=10):
    """Return the ``K`` vocabulary entries with the largest ``beta`` values.

    Args:
        beta: NumPy array of scores, one per vocabulary entry.
        vocab: sequence indexed in parallel with ``beta``.
        K: number of entries to return (default 10).

    Returns:
        List of entries of ``vocab`` ordered by descending score.
    """
    descending_positions = np.argsort(-beta)
    return [vocab[position] for position in descending_positions[:K]]

# import SAGE

def get_keywords(eta, vocab, min_len = 2, remove = []):
    """Select multi-word noun keywords from ``vocab``, ranked by ``eta``.

    Each vocabulary entry is indexed as ``entry[0]`` (the text) and
    ``entry[1]`` (compared against 'NOUN'). An entry is kept when its second
    element equals 'NOUN', its text has at least ``min_len`` space-separated
    parts, and its text contains no digit.

    Args:
        eta: scores passed to ``topK`` to rank the whole vocabulary.
        vocab: sequence of entries as described above.
        min_len: minimum number of space-separated parts (default 2).
        remove: texts to delete from the result; a missing one is reported
            with a printed message rather than an error. The default list is
            only read, never mutated.

    Returns:
        List of keyword texts in ranking order.
    """
    keywords = []
    for entry in topK(eta, vocab, len(vocab)):
        if entry[1] != 'NOUN':
            continue
        if len(entry[0].split(' ')) < min_len:
            continue
        if any(char in string.digits for char in entry[0]):
            continue
        keywords.append(entry[0])

    for word in remove:
        try:
            keywords.remove(word)
        except ValueError:
            print(f'{word} not in keywords')
    return keywords

class DeltaIterator:
    """Convergence tracker for an iterative estimate.

    ``done`` becomes True once the relative change between two consecutive
    values passed to ``update`` drops below ``thresh``, or once ``update``
    has been called more than ``max_its`` times.
    """

    def __init__(self, max_its=100, thresh=1e-4, debug=False):
        """Store the stopping criteria and reset the iteration state."""
        self.max_its = max_its
        self.thresh = thresh
        self.debug = debug
        self.its = 0          # number of update() calls so far
        self.prev = None      # value seen on the previous update() call
        self.done = False

    def update(self, x):
        """Record a new value ``x`` and re-evaluate the stopping criteria."""
        if self.prev is not None:
            # Relative change; the 1e-6 keeps the ratio finite when x is ~0.
            change = norm(x - self.prev) / (1e-6 + norm(x))
            if self.debug:
                print(self.its, '/', self.max_its, change)
            if change < self.thresh:
                self.done = True
        self.its += 1
        if self.its > self.max_its:
            self.done = True
        # Shallow copy so later changes to x do not alter the stored value.
        self.prev = copy.copy(x)

def estimate(ecounts,eq_m,max_its=25):
    """Iteratively estimate ``eta`` by alternating L-BFGS-B fits and precision updates.

    Args:
        ecounts: counts array; a 1-D input is reshaped to a single column.
        eq_m: array that is exponentiated and passed to the objective and
            gradient as ``exp_eq_m``.
        max_its: iteration limit handed to ``DeltaIterator`` (default 25).

    Returns:
        The final ``eta`` array (one value per row of ``ecounts``).
    """
    if ecounts.ndim == 1:
        ecounts = ecounts.reshape(-1, 1)
    W, K = ecounts.shape

    eta = np.zeros(W)
    eq_inv_tau = np.ones(W)
    exp_eq_m = np.exp(eq_m)
    max_inv_tau = 1e5

    tracker = DeltaIterator(debug=False, max_its=max_its, thresh=1e-4)
    while not tracker.done:
        # Minimise the objective for the current eq_inv_tau, warm-started
        # from the previous eta.
        result = minimize(
            fLogNormalAux,
            eta,
            args=(ecounts, exp_eq_m, eq_inv_tau),
            method='L-BFGS-B',
            jac=gLogNormalAux,
            options={'disp': False},
        )
        #TODO: try Newton-CG with a Hessian-product function (hpLogNormalAux)
        eta = result.x

        # Refresh eq_inv_tau from the new eta, capped at max_inv_tau
        # (eta == 0 gives inf, which the cap replaces).
        eq_inv_tau = 1 / (eta ** 2)
        eq_inv_tau[eq_inv_tau > max_inv_tau] = max_inv_tau
        tracker.update(eta)
    return eta

def fLogNormalAux(eta,ecounts,exp_eq_m,eq_inv_tau):
    """Objective value minimised by ``estimate``.

    Computes ``-(eta . counts - C * log(sum(exp(eta) * exp_eq_m)) - 0.5 * eq_inv_tau . eta**2)``
    where ``C`` is the column total of ``ecounts``, and returns the entry for
    the first column.

    Args:
        eta: 1-D parameter vector.
        ecounts: 2-D counts array (rows match ``eta``).
        exp_eq_m: array multiplied with ``exp(eta)`` inside the log term.
        eq_inv_tau: 1-D weights of the quadratic penalty on ``eta``.

    Returns:
        Scalar objective value.
    """
    W, K = ecounts.shape
    column_totals = ecounts.sum(axis=0)

    partition = np.tile(np.exp(eta), (K, 1)).dot(exp_eq_m.T)
    fit_term = eta.T.dot(ecounts).sum(axis=0)
    log_norm_term = column_totals * np.log(partition.sum(axis=0))
    penalty_term = 0.5 * eq_inv_tau.T.dot(eta ** 2)

    objective = -(fit_term - log_norm_term - penalty_term)
    return objective[0]
           
def gLogNormalAux(eta,ecounts,exp_eq_m,eq_inv_tau):
    """Gradient with respect to ``eta``, passed to the optimiser as ``jac`` by ``estimate``.

    Args:
        eta: 1-D parameter vector.
        ecounts: 2-D counts array (rows match ``eta``).
        exp_eq_m: array multiplied element-wise with ``exp(eta)``.
        eq_inv_tau: 1-D weights of the quadratic penalty on ``eta``.

    Returns:
        1-D gradient array: ``-(row counts - expected counts - eq_inv_tau * eta)``.
    """
    W, K = ecounts.shape
    column_totals = ecounts.sum(axis=0)

    unnormalised = np.tile(np.exp(eta), (K, 1)) * exp_eq_m
    probabilities = unnormalised.T / unnormalised.sum(axis=1)
    # The 1e-10 guards the ratio against a zero column total.
    beta = column_totals * probabilities / (column_totals + 1e-10)

    observed = ecounts.sum(axis=1)
    expected = beta.dot(column_totals)
    penalty_grad = eq_inv_tau * eta
    return -(observed - expected - penalty_grad)

In [None]:
# Load the preprocessed dataset. Only '\n' is treated as the line terminator.
data = pd.read_csv('data_processed.csv', lineterminator='\n')

In [None]:
# Load the spreadsheet of gender-associated terms
# (judging by the file name: top-100 1-to-3-grams -- confirm).
bins_gender = pd.read_excel('gender.top100.1to3grams_cleaned.xls')

In [None]:
# The first spreadsheet column is used as the women's term list,
# the second as the men's.
bins_women = list(bins_gender.iloc[:,0])
bins_men = list(bins_gender.iloc[:,1])

In [None]:
# Convert the men's terms to strings so they can be passed to str.count().
# Missing spreadsheet cells are skipped: str(NaN) would become the literal
# term 'nan', which is a substring of ordinary words and would inflate the
# men's counts.
# NOTE(review): presumably the only non-string entries are empty cells --
# confirm against the spreadsheet.
bins_men_clean = [str(term) for term in bins_men if pd.notna(term)]

In [None]:
# For every tweet text, total the substring matches against each term list.
count_bins_women = []
count_bins_men = []

for n_done, tweet in enumerate(data['clean_tweets'], start=1):
    women_hits = 0
    for term in bins_women:
        women_hits += tweet.count(term)

    men_hits = 0
    for term in bins_men_clean:
        men_hits += tweet.count(term)

    count_bins_women.append(women_hits)
    count_bins_men.append(men_hits)

    # Progress indicator: print the running total every 100 tweets.
    if n_done % 100 == 0:
        print(n_done)

In [None]:
# Store the per-row term-match totals as feature columns.
data['count_bins_women_wwbp'] = count_bins_women
data['count_bins_men_wwbp'] = count_bins_men

In [None]:
import gender_guesser.detector as gender

def detect_gender(names):
    """Guess a gender for each account name from its first word.

    Args:
        names: object with ``astype(str)`` that iterates over names
            (called with a DataFrame column in this notebook).

    Returns:
        ``(pred_names, genders)``: the title-cased first word of every name,
        and the value ``gender.Detector().get_gender`` returns for it.
    """
    names = names.astype(str)
    detector = gender.Detector()

    # First whitespace-separated word, normalised to Title case.
    pred_names = [full_name.split(" ", 1)[0].lower().title() for full_name in names]
    genders = [detector.get_gender(first_name) for first_name in pred_names]
    return pred_names, genders

In [None]:
# preds is the tuple (normalised first names, predicted genders).
preds = detect_gender(data['AccountNaam'])

In [None]:
# Keep only the predicted genders (second element of the tuple).
data['Gender'] = preds[1]

In [None]:
# Keep only rows whose predicted gender is exactly 'male' or 'female',
# then renumber the rows from 0.
data_gender = data[(data['Gender'] == 'male') | (data['Gender'] == 'female')].reset_index().drop('index', axis=1) #['Gender'].value_counts()

In [None]:
# Class balance of the retained rows.
data_gender['Gender'].value_counts()

In [None]:
# (rows, columns) of the gender-labelled subset.
data_gender.shape

In [None]:
# Multiply abbr_count by tweet_length.
# NOTE(review): this overwrites the column in place, so running the cell
# again multiplies it a second time -- run it exactly once per kernel.
data_gender['abbr_count'] = data_gender['abbr_count']*data_gender['tweet_length']


In [None]:
# Mean abbr_count over the rows labelled 'female'.
data_gender[data_gender['Gender']=='female']['abbr_count'].mean()

In [None]:
# Standard deviation of abbr_count over the rows labelled 'female'.
data_gender[data_gender['Gender']=='female']['abbr_count'].std()

In [None]:

# Divide each count column of `data` by tweet_length.
# NOTE(review): data_gender was derived from `data` in an earlier cell and the
# feature matrix is built from data_gender, so these rescaled columns do not
# appear to reach the model -- confirm intent. The cell also overwrites its
# inputs, so re-running it divides again.
length_normalized_columns = [
    'capital_count', 'punctuation_count',
    'emoji_count', 'laugh_count', 'swear_count', 'diff_words_count', 'abbr_count',
    'count_13_18_pos', 'count_13_18_neg', 'count_19_22_pos', 'count_19_22_neg',
    'count_23_29_pos', 'count_23_29_neg', 'count_30_pos', 'count_30_neg',
    'literal_emojis',
]

for column in length_normalized_columns:
    data[column] = data[column] / data['tweet_length']

In [None]:
# Columns used as model features, in the order they appear in the matrix:
# account/tweet statistics, POS-tag columns, the term-match counts and the
# emoji0..emoji99 columns.
feature_columns = (
    [
        'AccountTweets', 'AccountFollowers', 'AccountFollowing', 'rt_count', 'urls_count',
        'capital_count', 'tweet_length', 'punctuation_count', 'hashtag_count',
        'emoji_count', 'laugh_count', 'swear_count',
        'diff_words_count', 'abbr_count', 'count_13_18_pos',
        'count_13_18_neg', 'count_19_22_pos', 'count_19_22_neg',
        'count_23_29_pos', 'count_23_29_neg', 'count_30_pos', 'count_30_neg',
        'nnp', 'nn', 'rb', 'in', 'fw', 'nnps', 'vbp', 'cd', 'vbd', 'md', 'vb',
        'vbg', 'vbz', 'rbr', 'vbn', 'jjs', 'cc',
        'site', 'creation_date', 'literal_emojis',
        'emoji0', 'count_bins_women_wwbp', 'count_bins_men_wwbp',
    ]
    # emoji1 ... emoji99
    + [f'emoji{k}' for k in range(1, 100)]
)

features = data_gender[feature_columns]
labels = data_gender['Gender']

In [None]:
# preprocessing.normalize is called with its defaults here; the result is
# wrapped back into a DataFrame that keeps the original column names.
normalized_features = preprocessing.normalize(features)
normalized_features_df = pd.DataFrame(normalized_features, columns=features.columns)

In [None]:
# 80/20 train/test split. The seed is fixed so that Restart & Run All
# reproduces the same partition (and therefore the same reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    normalized_features_df, labels, test_size=0.2, random_state=0)

In [None]:
from xgboost import XGBClassifier

# Fit gradient-boosted trees on the training split and report hold-out metrics.
# NOTE(review): y_train holds the strings 'male'/'female'; whether the
# installed xgboost accepts string class labels is version dependent -- confirm.
model = XGBClassifier(n_estimators=100, max_depth = 7, learning_rate=0.05)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# cross_validate was never imported at the top of the notebook.
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier

# Macro-averaged precision/recall: the plain 'precision'/'recall' scorers are
# binary scorers with pos_label=1, which is not a valid label when the classes
# are the strings 'male'/'female'.
scorings = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Use a separate, unfitted estimator for cross-validation. cross_validate fits
# clones, so reusing the name `model` would replace the fitted classifier with
# an unfitted one and break later cells that read model.feature_importances_.
cv_model = XGBClassifier(n_estimators=100, max_depth = 7, learning_rate=0.05)
cross_validate(cv_model, X_train, y_train, scoring=scorings, cv=5)

In [None]:
import pandas as pd
# Table of feature importances, one row per training column, largest first.
# Requires `model` to be a fitted estimator (feature_importances_ only exists
# after fit).
feature_importances = pd.DataFrame(model.feature_importances_,
                               index = X_train.columns,
                      columns=['importance']).sort_values('importance',ascending=False)

In [None]:
import xgboost as xgb
# Plot the 10 most important features ranked by 'gain'; `model` must be fitted.
xgb.plot_importance(model, max_num_features = 10, importance_type='gain')

In [None]:
### IGM

# Characters deleted from every token of the raw "following" string.
_FOLLOWING_STRIP_TABLE = str.maketrans('', '', ',.()][')


def _tokenize_following(rows):
    """Lower-case each row, split it on whitespace and strip , . ( ) [ ] from every token."""
    return [
        [word.translate(_FOLLOWING_STRIP_TABLE) for word in row.lower().split()]
        for row in rows
    ]


data_new = data_gender[['Volgendaccount', 'Gender']]

# Followed-account strings: everyone, class 0 (male) and class 1 (female).
following_ids = data_new['Volgendaccount']
following_ids0 = data_new[data_new['Gender']=='male']['Volgendaccount']
following_ids1 = data_new[data_new['Gender']=='female']['Volgendaccount']

data0 = _tokenize_following(following_ids0)
data1 = _tokenize_following(following_ids1)
data_all = _tokenize_following(following_ids)

# Per-user term-frequency dictionaries over followed-account ids.
list_tf = [computeReviewTFDict(tokens) for tokens in data_all]
list_tf0 = [computeReviewTFDict(tokens) for tokens in data0]
list_tf1 = [computeReviewTFDict(tokens) for tokens in data1]

# For each followed account: number of users of each class following it.
countDict0 = computeCountDict(list_tf0)
countDict1 = computeCountDict(list_tf1)

df_counts0 = pd.DataFrame.from_dict(countDict0, orient='index')
df_counts1 = pd.DataFrame.from_dict(countDict1, orient='index')

df = pd.concat([df_counts0, df_counts1], axis=1, sort=False).fillna(0)
df.columns = ['counts0', 'counts1']
df['sum'] = df.sum(axis=1)
df['max'] = df[['counts0', 'counts1']].max(axis=1)

# Minimum number of followers required in *each* class.
max_following = 1 # This is 10 in the study

followed_by_both = (df['counts0'] >= max_following) & (df['counts1'] >= max_following)
df = df[followed_by_both]

# Share of an account's followers that belong to its dominant class.
df['igm'] = df['max']/df['sum']

In [None]:
# Accounts whose largest follower count is class 0 (male), highest igm first.
df[df['counts0'] == df['max']].sort_values('igm', ascending=False)

In [None]:
# Number of followed accounts that survived the filter, and column count.
df.shape

In [None]:
# 461922813 => @Ibra_official

# 50323173 => @wojespn

# 28870086 => @mortreport

# 15332636 => @talkSPORT

# 2835653131 => @miakhalifa

In [None]:
# 243098820 => @ItsGaryTime

# 386370914 => @ryancedwards

# 15613133 => @Smashbox

# 2391359107 => @AmberLPortwood

# 251706984 => @PBandJenelley_1

In [None]:
# Followed-account id -> igm weight.
dct_igm = dict(zip(df.index, df['igm']))

# A loop that used to follow here ("for i in dct_igm: if i == 1.0: i = 0") was
# removed: it compared the dictionary *keys* with 1.0 and only rebound the
# loop variable, so it never changed dct_igm.
# NOTE(review): if the intent was to zero out weights equal to 1.0, that has
# never been applied -- confirm whether it is wanted.

# map_dict uses only the keys of each tf dictionary: every followed account
# gets its igm weight (0 when the account is not in dct_igm), scaled by the
# log of the number of distinct accounts the user follows.
users_mapped = [map_dict(dct_igm, user_tf) for user_tf in list_tf]

# One dense row per user, columns in sorted account-id order.
igm_result = [computeTFIDFVector(dct_igm, account_id) for account_id in users_mapped]

In [None]:
# Stack the per-user IGM rows into a matrix, normalise it with
# preprocessing.normalize (default arguments) and wrap the result in a
# DataFrame with default integer column labels.
igm_array = np.array(igm_result)
igm_df = pd.DataFrame(igm_array)
normalized_igm = preprocessing.normalize(igm_df)
normalized_igm_df = pd.DataFrame(normalized_igm)

In [None]:
# Place the IGM columns next to the hand-crafted features.
# The IGM frame has integer column labels while the feature frame has string
# labels; recent scikit-learn releases reject frames whose column names mix
# types, so the IGM columns get string names first.
igm_features_normalized = pd.concat(
    [normalized_igm_df.rename(columns=lambda col: f'igm_{col}'), normalized_features_df],
    axis=1)

In [None]:
# 80/20 train/test split. The seed is fixed so that Restart & Run All
# reproduces the same partition (and therefore the same reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    igm_features_normalized, labels, test_size=0.2, random_state=0)

In [None]:
# Logistic regression on IGM + hand-crafted features.
# Only the non-default settings are passed. The arguments that were spelled
# out before were all library defaults, and passing `multi_class` explicitly
# is deprecated in recent scikit-learn releases.
model = LogisticRegression(max_iter=5000, random_state=0)
model.fit(X_train, y_train)

# Hold-out evaluation.
y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

In [None]:
# cross_validate was never imported at the top of the notebook.
from sklearn.model_selection import cross_validate

# Macro-averaged precision/recall: the plain 'precision'/'recall' scorers are
# binary scorers with pos_label=1, which is not a valid label when the classes
# are the strings 'male'/'female'.
scorings = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Only non-default settings are passed (explicit `multi_class` is deprecated
# in recent scikit-learn releases).
model = LogisticRegression(max_iter=5000, random_state=0)
cross_validate(model, X_train, y_train, scoring=scorings, cv=5)

In [None]:
# For every user, rebuild one text from the fragments of the raw 'Tweets'
# string, dropping every fragment that contains 'RT'.
all_tweets = []
for tweets in data_gender['Tweets']:
    # Split once per user (the string used to be re-split for every fragment).
    fragments = tweets.split("\'")
    tweets_user = [fragment for fragment in fragments if 'RT' not in fragment]
    all_tweets.append(' '.join(tweets_user))

    # Progress indicator: print the running total every 1000 users.
    if len(all_tweets) % 1000 == 0:
        print(len(all_tweets))

In [None]:
# Punctuation deleted from the tweets; '#' and '@' are not in this set.
punctuation_table = str.maketrans('', '', '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~')

clean_tweets = []
for n_done, tweet in enumerate(all_tweets, start=1):
    # Drop URLs, lower-case, delete punctuation, trim, collapse runs of spaces.
    without_links = re.sub(r'http\S+', '', tweet)
    lowered = without_links.lower()
    stripped = lowered.translate(punctuation_table).strip()
    clean_tweets.append(re.sub(' +', ' ', stripped))

    # Progress indicator: print the running total every 1000 tweets.
    if n_done % 1000 == 0:
        print(n_done)

In [None]:
# Raw-string patterns: '\s' inside a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
# Each pattern matches one whitespace character followed by a @handle or
# #hashtag (up to the next comma or space); a handle or hashtag at the very
# start of the text is therefore not matched.
handle_pattern = re.compile(r'(?:\s)@[^, ]*')
hashtag_pattern = re.compile(r'(?:\s)#[^, ]*')

final_tweets = []
for tweet in clean_tweets:
    no_handle = handle_pattern.sub('', tweet)
    no_hashtag = hashtag_pattern.sub('', no_handle)
    final_tweets.append(no_hashtag)

    # Progress indicator: print the running total every 1000 tweets.
    if len(final_tweets) % 1000 == 0:
        print(len(final_tweets))

In [None]:
# Cleaned tweet text with handles and hashtags removed, one entry per user.
data_gender['sage_tweets'] = final_tweets

In [None]:
stemmer = PorterStemmer()

# Porter-stem every whitespace-separated token. Each stem is prefixed with a
# single space, so a non-empty result starts with a space.
tweets_stemmed = []
for n_done, tweet in enumerate(data_gender['sage_tweets'], start=1):
    stems = [stemmer.stem(token) for token in tweet.split()]
    tweets_stemmed.append("".join(" " + stem for stem in stems))

    # Progress indicator: print the running total every 1000 tweets.
    if n_done % 1000 == 0:
        print(n_done)

In [None]:
# Stemmed tweet text, one entry per user.
data_gender['tweets_stemmed'] = tweets_stemmed

In [None]:
#### SAGE

# (Translated from Dutch:) So here it is correct -- tweets.
# Stemmed tweet text of the male and of the female users.
tweets_male = data_gender[data_gender['Gender'] == 'male']['tweets_stemmed']
tweets_female = data_gender[data_gender['Gender'] == 'female']['tweets_stemmed']

In [None]:
# Join each gender's texts into one space-separated string.
# Note: from here on both names hold a str, no longer a Series.
tweets_male = tweets_male.str.cat(sep=' ')
tweets_female = tweets_female.str.cat(sep=' ')

In [None]:
# Lower-case both corpora.
tweets_male = tweets_male.lower()
tweets_female = tweets_female.lower()

In [None]:
# Tokenise each gender's corpus with NLTK's word_tokenize.
tokens_male = word_tokenize(tweets_male)
tokens_female = word_tokenize(tweets_female)

In [None]:
# get_counters returns counts over the joint vocabulary for (first argument,
# second argument), so male_random_counter holds the *female* corpus counts
# used as background. A token absent from a corpus gets count 1 there.
male_counter, male_random_counter = get_counters(tokens_male, tokens_female)
# Vocabulary: the 2000 tokens with the highest counts in the male corpus.
vocab_male = [word for word,count in Counter(male_counter).most_common(2000)]

# Same with the roles swapped: female counts, with male counts as background.
female_counter, female_random_counter = get_counters(tokens_female, tokens_male)
vocab_female = [word for word,count in Counter(female_counter).most_common(2000)]

In [None]:
# Count vectors over each gender's own vocabulary.
x_male = np.array([male_counter[word] for word in vocab_male])
x_female = np.array([female_counter[word] for word in vocab_female])

# Background counts of the same words in the other gender's corpus, plus 1.
x_random_male = np.array([male_random_counter[word] for word in vocab_male]) + 1.
x_random_female = np.array([female_random_counter[word] for word in vocab_female]) + 1.

# Log relative frequency of every word in the background counts.
mu_male = np.log(x_random_male) - np.log(x_random_male.sum())
mu_female = np.log(x_random_female) - np.log(x_random_female.sum())

In [None]:
# Fit one eta value per vocabulary word, separately for each gender.
eta_male = estimate(x_male,mu_male)
eta_female = estimate(x_female,mu_female)

In [None]:
# word -> eta lookup tables.
dct_male= dict(zip(vocab_male,eta_male))
dct_female= dict(zip(vocab_female,eta_female))

In [None]:
# One dictionary per user whose keys are the user's distinct tokens; the
# values are placeholders (0) that map_dict later ignores.
# NOTE(review): the tokens come from 'sage_tweets' (not stemmed) while the eta
# dictionaries were built from 'tweets_stemmed' -- confirm this is intended.
all_words = [
    dict.fromkeys(word_tokenize(str(user_text)), 0)
    for user_text in data_gender['sage_tweets']
]

In [None]:
# Replace every user's tokens with their eta weight (0 when the token is not
# in the vocabulary), scaled by map_dict; once per gender-specific table.
words_mapped_male = [map_dict(dct_male, user_words) for user_words in all_words]

words_mapped_female = [map_dict(dct_female, user_words) for user_words in all_words]

In [None]:
# One dense row per user over each gender's vocabulary, in sorted word order;
# words the user did not use get 0.0.
sage_result_male = [computeTFIDFVector(dct_male, user) for user in words_mapped_male]
sage_result_female = [computeTFIDFVector(dct_female, user) for user in words_mapped_female]

In [None]:
# Convert the row lists to NumPy arrays.
array_male = np.array(sage_result_male)
array_female = np.array(sage_result_female)

In [None]:
# Side by side: the male-vocabulary columns first, then the female ones.
combined = np.column_stack([array_male, array_female])

In [None]:
df_sage = pd.DataFrame(combined)

# Boolean mask of the rows without NaN or +/-inf. `axis` is passed by keyword
# because pandas 2.x no longer accepts it positionally in DataFrame.any.
indices_to_keep_tweets = ~df_sage.isin([np.nan, np.inf, -np.inf]).any(axis=1)

# Normalise the surviving rows. The resulting frame has a fresh 0..n-1 index,
# not the original row labels.
df_sage_clean = df_sage[indices_to_keep_tweets].astype(np.float64)
normalized_tweets = preprocessing.normalize(df_sage_clean)
normalized_tweets_df = pd.DataFrame(normalized_tweets)

In [None]:
# Labels of the rows kept by the NaN/inf filter. The labels are renumbered
# 0..n-1 before masking, so the result keeps the positions of the kept rows
# as its index.
labels_renumbered = labels.reset_index(drop=True)
labels_clean = labels_renumbered[indices_to_keep_tweets]

In [None]:
# Apply the same row filter to the IGM + hand-crafted feature matrix.
igm_features_normalized_kept = igm_features_normalized[indices_to_keep_tweets]

In [None]:
# Renumber both blocks 0..n-1 on fresh copies (instead of overwriting the
# index of a filtered frame in place) so that concat pairs rows by position.
sage_part = normalized_tweets_df.reset_index(drop=True)
igm_part = igm_features_normalized_kept.reset_index(drop=True)

# Give every column a unique string label. The SAGE block's default integer
# labels are prefixed so they cannot duplicate integer labels in the other
# block, and all labels become strings because recent scikit-learn releases
# reject frames whose column names mix types.
sage_part.columns = [f'sage_{col}' for col in sage_part.columns]
igm_part.columns = [str(col) for col in igm_part.columns]

# Row count is preserved: both blocks were filtered with the same mask.
all_features = pd.concat([sage_part, igm_part], axis=1)

In [None]:
# 80/20 train/test split. The seed is fixed so that Restart & Run All
# reproduces the same partition (and therefore the same reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    all_features, labels_clean, test_size=0.2, random_state=0)

In [None]:
# Logistic regression on SAGE + IGM + hand-crafted features.
# Only the non-default settings are passed. The arguments that were spelled
# out before were all library defaults, and passing `multi_class` explicitly
# is deprecated in recent scikit-learn releases.
# NOTE(review): max_iter is 1000 here but 5000 in the cross-validation cell --
# confirm which one is intended.
model = LogisticRegression(max_iter=1000, random_state=0)
model.fit(X_train, y_train)

# Hold-out evaluation.
y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

In [None]:
# Diagonal divided by the row sums: the share of each true class that was
# predicted correctly (per-class recall).
matrix = confusion_matrix(y_test, y_pred)
matrix.diagonal()/matrix.sum(axis=1)

In [None]:
# cross_validate was never imported at the top of the notebook.
from sklearn.model_selection import cross_validate

# Macro-averaged precision/recall: the plain 'precision'/'recall' scorers are
# binary scorers with pos_label=1, which is not a valid label when the classes
# are the strings 'male'/'female'.
scorings = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Only non-default settings are passed (explicit `multi_class` is deprecated
# in recent scikit-learn releases).
model = LogisticRegression(max_iter=5000, random_state=0)
cross_validate(model, X_train, y_train, scoring=scorings, cv=5)

In [None]:
# Work on a copy: a plain assignment would only alias all_features, so adding
# a label column to all_features_new would also add it to the feature matrix
# (and leak the target into any later train/test split of all_features).
all_features_new = all_features.copy()

In [None]:
# Assign by position, not by index: the feature frame is numbered 0..n-1 while
# labels_clean keeps the row labels it had before filtering, so an
# index-aligned assignment would attach labels to the wrong rows (or NaN)
# whenever rows were dropped.
all_features_new['labels'] = labels_clean.to_numpy()

In [None]:
# (rows, columns) of the combined feature matrix.
all_features.shape

In [None]:
# f1_scores = []
# for i in range(1000, 6000, 500):
#     sample = all_features_new.sample(i)
    
# #     X_train, X_test, y_train, y_test = train_test_split(sample.iloc[:,:(len(sample.columns)-1)], sample['labels'], test_size=0.2)
    

#     model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#                        intercept_scaling=1, l1_ratio=None, max_iter=1000,
#                        multi_class='auto', n_jobs=None, penalty='l2',
#                        solver='lbfgs', tol=0.0001, verbose=0,
#                        warm_start=False, random_state=0)# , 
    
#     scores = cross_val_score(model, sample.iloc[:,:(len(sample.columns)-1)], sample['labels'], cv=5, scoring='f1_macro')
#     f1_scores.append(scores.mean())
# #     y_pred = model.predict(X_test)
# #     conf_mat = confusion_matrix(y_test, y_pred)
# #     f1_scores.append(metrics.f1_score(y_test,y_pred, average='macro'))

In [None]:
# 0.8004743500179003,
#  0.8065622227425413,
#  0.8084215280400642,
#  0.8214528704084145,
#  0.833778966773781,
#  0.8320699144344921,
#  0.8255984333414521,
#  0.828508465992971,
#  0.8306102667022429,
#  0.8355304024973392

In [None]:
# The 50 male-vocabulary words with the largest eta.
topK(eta_male, vocab_male, K=50)

In [None]:
# The 50 female-vocabulary words with the largest eta.
topK(eta_female, vocab_female, K=50)