In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
csv = 'clean_tweet.csv'
my_df = pd.read_csv(csv,index_col=0)
my_df.head()

Unnamed: 0,text,account_type
0,we have sitting democrat us senator on trial f...,Right
1,marshawn lynch arrives to game in anti trump s...,Right
2,daughter of fallen navy sailor delivers powerf...,Right
3,just in president trump dedicates presidents c...,Right
4,respecting our national anthem standforouranthem,Right


In [3]:
my_df.dropna(inplace=True)
my_df.reset_index(drop=True,inplace=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202315 entries, 0 to 202314
Data columns (total 2 columns):
text            202315 non-null object
account_type    202315 non-null object
dtypes: object(2)
memory usage: 3.1+ MB


In [4]:
x = my_df.text
y = my_df.account_type

In [6]:
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [7]:
from sklearn.linear_model import LogisticRegression

In [10]:
from tqdm import tqdm
#tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils

In [11]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result

In [12]:
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

  """


In [13]:
len(all_x_w2v)

202315

#### Distributed Bag of Words

In [15]:
model_ug_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=1, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_x_w2v)])

100%|██████████| 202315/202315 [00:00<00:00, 688926.39it/s]


In [16]:
%%time
for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|██████████| 202315/202315 [00:00<00:00, 1904217.23it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2445455.50it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2256260.67it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2453338.73it/s]
100%|██████████| 202315/202315 [00:00<00:00, 1208440.60it/s]
100%|██████████| 202315/202315 [00:00<00:00, 1868353.40it/s]
100%|██████████| 202315/202315 [00:00<00:00, 1977029.26it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2205551.79it/s]
100%|██████████| 202315/202315 [00:00<00:00, 981372.94it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2414835.03it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2255085.45it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2352116.21it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2320223.26it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2459931.57it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2226395.52it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2608506.42it/s]
100%|██████████| 202315/2

CPU times: user 4min 23s, sys: 4.8 s, total: 4min 28s
Wall time: 5min 11s


In [17]:
def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

In [18]:
train_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)

In [19]:
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
clf.score(validation_vecs_dbow, y_validation)

0.7345526445872467

In [21]:
model_ug_dbow.save('d2v_model_ug_dbow.doc2vec')
model_ug_dbow = Doc2Vec.load('d2v_model_ug_dbow.doc2vec')

In [22]:
model_ug_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

#### Distributed memory - concatenated

In [24]:
cores = multiprocessing.cpu_count()
model_ug_dmc = Doc2Vec(dm=1, dm_concat=1, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dmc.build_vocab([x for x in tqdm(all_x_w2v)])

100%|██████████| 202315/202315 [00:00<00:00, 1659695.18it/s]


In [25]:
%%time
for epoch in range(30):
    model_ug_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha

100%|██████████| 202315/202315 [00:00<00:00, 1165449.28it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2462422.85it/s]
100%|██████████| 202315/202315 [00:00<00:00, 1708282.65it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2101405.39it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2384663.59it/s]
100%|██████████| 202315/202315 [00:00<00:00, 1283203.03it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2321702.39it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2246417.19it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2207956.34it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2408399.36it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2198426.42it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2320953.07it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2465635.40it/s]
100%|██████████| 202315/202315 [00:00<00:00, 399427.72it/s]
100%|██████████| 202315/202315 [00:00<00:00, 1412596.30it/s]
100%|██████████| 202315/202315 [00:00<00:00, 2163628.10it/s]
100%|██████████| 202315/2

CPU times: user 8min 8s, sys: 1min 4s, total: 9min 13s
Wall time: 7min 13s


In [27]:
#model_ug_dmc = Doc2Vec.load('d2v_model_ug_dmc.doc2vec')

In [28]:
model_ug_dmc.most_similar('good')


  """Entry point for launching an IPython kernel.


[('bad', 0.6211588382720947),
 ('surcharge', 0.5239689350128174),
 ('sad', 0.5197510719299316),
 ('interesting', 0.5149270296096802),
 ('predictable', 0.5012583136558533),
 ('excellent', 0.5000801682472229),
 ('awesome', 0.4975941777229309),
 ('huuuuge', 0.4973853528499603),
 ('great', 0.4973595142364502),
 ('tricky', 0.49431976675987244)]

In [29]:
model_ug_dmc.most_similar('happy')

  """Entry point for launching an IPython kernel.


[('blessed', 0.5467841625213623),
 ('omgtr', 0.5459575057029724),
 ('thankful', 0.5232704877853394),
 ('proud', 0.5022544860839844),
 ('uncomfortable', 0.49687355756759644),
 ('excited', 0.4927910566329956),
 ('stormy', 0.47982290387153625),
 ('debuted', 0.47648757696151733),
 ('ivotedfortrump', 0.47207677364349365),
 ('grandioses', 0.47205495834350586)]

In [30]:
model_ug_dmc.most_similar('facebook')

  """Entry point for launching an IPython kernel.


[('twitter', 0.5870563983917236),
 ('reddit', 0.574428379535675),
 ('youtube', 0.5729553699493408),
 ('instagram', 0.5629979372024536),
 ('fb', 0.5402119755744934),
 ('tvone', 0.5367606282234192),
 ('biography', 0.5334429740905762),
 ('raskass', 0.5183946490287781),
 ('nanotechnology', 0.5081194043159485),
 ('whatsapp', 0.5031887888908386)]

In [31]:
model_ug_dmc.most_similar('trump')

  """Entry point for launching an IPython kernel.


[('lyincrookedhillary', 0.604820966720581),
 ('islamaphobia', 0.5932666659355164),
 ('detractors', 0.588485598564148),
 ('plagio', 0.5837088227272034),
 ('tryi', 0.581690788269043),
 ('trumps', 0.5771034955978394),
 ('rumsfeld', 0.5755376815795898),
 ('realdonaldt', 0.5662996768951416),
 ('juanwilliams', 0.5599595308303833),
 ('moretti', 0.5520181059837341)]

In [32]:
model_ug_dmc.most_similar(positive=['bigger', 'small'], negative=['big'])

  """Entry point for launching an IPython kernel.


[('tougher', 0.4775066375732422),
 ('prettier', 0.4615219533443451),
 ('scarier', 0.4473501443862915),
 ('lice', 0.4442978501319885),
 ('humane', 0.44116953015327454),
 ('warmer', 0.43999165296554565),
 ('stronger', 0.4343826472759247),
 ('participating', 0.4324437081813812),
 ('pain', 0.42833268642425537),
 ('hotter', 0.42792409658432007)]

In [33]:
train_vecs_dmc = get_vectors(model_ug_dmc, x_train, 100)
validation_vecs_dmc = get_vectors(model_ug_dmc, x_validation, 100)

In [34]:
clf = LogisticRegression()
clf.fit(train_vecs_dmc, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
clf.score(validation_vecs_dmc, y_validation)

0.5610479485912012

In [36]:
model_ug_dmc.save('d2v_model_ug_dmc.doc2vec')
model_ug_dmc = Doc2Vec.load('d2v_model_ug_dmc.doc2vec')
model_ug_dmc.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

#### Distributed memory (mean)