In [17]:
# Import whole libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Specfic library functions
from sklearn.pipeline import Pipeline

# Home-made modules and functions
from mpcorpusreader import MPTweetCorpusReader

# External loading!
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance

class KMeansClusters(BaseEstimator, TransformerMixin,
                     KMeansClusterer):
    """
    A sklearn-API compatible KMeans-clusterer using nltk's 
    implementation which enable use of cosine-distane metric
    """
    
    def __init__(self, k=5):
        """
        self.k: Number of clusers.
        self.distnace: Distance metric for clustering
        self.model: nltk-implementation of KMeans
        """
        self.k = k
        self.distance = cosine_distance
        KMeansClusterer.__init__(self, self.k, self.distance,
                                    avoid_empty_clusters=True)
        
    def fit(self, documents, labels=None):
        return self
    
    def transform(self, documents):
        """
        Fits the K-means model a vector repr. of documents.
        """
        return self.cluster(documents, assign_clusters=True)

In [19]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils import shuffle
import multiprocessing

df = pd.read_pickle('corpus/tweet_df.pkl')
df_user = pd.read_pickle('corpus/user_df.pkl')

cores = multiprocessing.cpu_count()
embedding = Doc2Vec(workers=cores)

tagged = [
    TaggedDocument(tweet.split(), tags=['tweet_'+str(idx), user, party])
    for idx, user, party, tweet in df[['user', 'party', 'text']].itertuples()
]


embedding.build_vocab(tagged)
embedding.train(shuffle(tagged), len(tagged), epochs=10)

X = np.zeros((len(df['user'].unique()), 100))
for ix, user in enumerate(df['user'].unique()):
    X[ix] = embedding.docvecs[user]

In [20]:
cluster = KMeansClusters()
df_user['cluster'] = cluster.fit_transform(X)

In [28]:
idx = 17
df.loc[17, 'text']

'RT @sajidjavid: The #SpendingRound today isn’t just about numbers on a spreadsheet – it’s about making sure our public services have the fu…'

In [39]:
infer_vector = embedding.infer_vector(df.loc[17, 'text'])
embedding.docvecs.most_similar([infer_vector])

[('tweet_1357', 0.9916787147521973),
 ('Andrew Selous', 0.9884707927703857),
 ('Kerry McCarthy', 0.9884203672409058),
 ('Stephen Hepburn', 0.9873685836791992),
 ('Graham Stuart', 0.9873088598251343),
 ('Dr Lisa Cameron', 0.9868918061256409),
 ('Damian Hinds', 0.9862691760063171),
 ('Brandon Lewis', 0.9847385883331299),
 ('Joanna Cherry QC', 0.9845963716506958),
 ('tweet_1334', 0.984508752822876)]

In [32]:
infer_vector

array([ 1.4762747e-02,  3.3674113e-02, -1.5858870e-02,  5.1176615e-02,
       -6.5921165e-02,  1.0595961e-02,  5.1972351e-05, -1.3516556e-02,
       -2.7065841e-02,  4.6927210e-02,  3.6245945e-03,  8.2323067e-03,
       -1.8420672e-02,  1.6610589e-03, -2.6716447e-02, -2.9652877e-02,
        4.7162324e-02, -1.4260069e-02,  4.4220880e-02,  1.6220301e-02,
        1.1427360e-02, -1.3542630e-02, -3.9790086e-02, -4.7522383e-03,
       -3.2192047e-04, -7.1725687e-03, -4.9957968e-02,  8.3252892e-02,
        1.9314906e-03, -1.6297629e-02, -4.6066385e-02, -3.8607903e-03,
       -7.8736097e-03,  1.1164392e-02,  4.7877811e-02, -2.4158424e-02,
       -2.1894900e-02, -1.9637017e-02,  5.8120658e-04, -8.3656311e-03,
        1.0082898e-02,  8.4627233e-02,  1.1565660e-02,  5.0501049e-02,
       -4.5213182e-02, -1.4366756e-02,  2.2933714e-02, -8.1223458e-02,
       -4.7050670e-02,  3.6037289e-02, -2.2414474e-02,  1.2102568e-02,
       -7.5870246e-02, -6.3558526e-02,  5.0127339e-02, -3.4404416e-02,
      

In [44]:
for text in df['text']:
    print(text)

I welcome this great news for our military personnel and their families. A great start and lots more to do to suppo… https://t.co/oZF23fxI52
Delighted our Prime Minister has secured a new parliamentary session. The Queen’s Speech will usher in a new &amp; wort… https://t.co/qCQ1di9O8M
It was privilege to attend &amp; celebrate the 20th yr on the throne for His Royal Majesty, the Asantehene Otumfuo Osei… https://t.co/vDhZw1Tepy
With HS2 under review, it is now time to review (&amp; stop) a 3rd runway at Heathrow. It would be the most expensive a… https://t.co/AVbzqQ3XBD
Loving all the #Farm24 support today. It’s thanks to our trailblazing farmers that British produce truly is in a le… https://t.co/kwNt3H41Tk
Strategists be careful. If the Government thinks it can call an Election by declaring no confidence in itself, my h… https://t.co/OultWYr9Jv
US here we come! Melton Mowbray pork pies are a million times better than McDonald’s! US get ready for good eat-in… https://t.co/fW6ccpwGbo
Ac

RT @KevinJPringle: A decent poster for the next Yes campaign! #indyref2 https://t.co/sZpvxDGJM8
RT @BBCNewsnight: "Scotland didn’t vote for Brexit... and we’re getting very close to the situation where we’re going to have to hold a sec…
RT @pforpaddy: Holyrood v Westminster.Ken Clarke v Boris Johnson. John Bercow v Michael Gove. Caroline Lucas v Jacob Rees Mogg. The death o…
Scottish Older People's Assembly  @Scotopa is taking place on the 5th of October. Register on eventbrite today… https://t.co/7eInshefj0
RT @transcotland: As seen in the #ScotPfG, taking climate action is one of the priorities in our new draft National Transport Strategy, set…
RT @HolyroodDaily: Nicola Sturgeon has used her Programme for Government to reveal a raft of measures to tackle climate change, poverty and…
RT @NicolaSturgeon: Very good result in the Commons tonight (has a PM ever before lost his/her first Commons vote?). Next step now is to ge…
RT @MathesonMichael: Mission zero for transport:

✅ Over £500m 

In [42]:
df

'RT @lewis_goodall: Listening to Johnson’s choice of words, I am becoming more and more convinced he is going to try and withhold royal asse…'

In [31]:
embedding.docvecs['tweet_1357']

array([ 0.01214413,  0.05389059, -0.01622104,  0.06273834, -0.09434688,
        0.01293741,  0.00195615, -0.01839297, -0.0419389 ,  0.0618578 ,
        0.00552464,  0.01012808, -0.02906732, -0.00230042, -0.03674529,
       -0.03581558,  0.06836634, -0.02166407,  0.05825923,  0.02254649,
        0.02066812, -0.01930691, -0.04842953, -0.0054675 , -0.0033777 ,
       -0.0055858 , -0.06485027,  0.11606472, -0.00674247, -0.03399256,
       -0.05881923,  0.00211465, -0.00689974,  0.0152269 ,  0.06462114,
       -0.03928853, -0.02759632, -0.03038556,  0.01117666, -0.00476512,
        0.01455165,  0.11076656,  0.01712131,  0.06946161, -0.05525431,
       -0.02229617,  0.03463924, -0.10842063, -0.06700522,  0.04639573,
       -0.02340258,  0.01516672, -0.1052444 , -0.07842238,  0.06688546,
       -0.04245211,  0.0167925 ,  0.02325521, -0.03779552, -0.05625606,
        0.01862906, -0.03093114,  0.02423198, -0.01105568,  0.00720585,
        0.06286293, -0.00011876,  0.04906327, -0.0824777 , -0.07

[array([ 0.0196886 ,  0.03004109,  0.00273642,  0.03857538, -0.09545139,
        -0.00285172,  0.02180928, -0.02532185, -0.04347813,  0.06021056,
         0.02863275,  0.01038274, -0.0354941 , -0.01468381, -0.02363212,
        -0.04696798,  0.068734  ,  0.00410612,  0.04795825,  0.02266382,
         0.02486286, -0.00500788, -0.06924967, -0.02332333,  0.01471414,
         0.00514899, -0.04835189,  0.1066112 , -0.01759321, -0.03472201,
        -0.06274851, -0.00930953, -0.01326068,  0.01110438,  0.04361556,
        -0.03352385, -0.04016069, -0.05171088,  0.02717455,  0.00287262,
         0.02499306,  0.09505673,  0.03930772,  0.05220066, -0.05226597,
        -0.01854304,  0.02487451, -0.1123423 , -0.07096996,  0.05380477,
        -0.03017896,  0.01047595, -0.1165974 , -0.0603591 ,  0.05755799,
        -0.03931305,  0.03815719,  0.00970883, -0.04247025, -0.03106517,
         0.01875816, -0.04373091,  0.03607152, -0.02488699,  0.00763258,
         0.05058877, -0.00947796,  0.03069792, -0.0