In [1]:
# Import whole libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Specfic library functions
from sklearn.pipeline import Pipeline

# Home-made modules and functions
from mpcorpusreader import MPTweetCorpusReader

# External loading!
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance

class KMeansClusters(BaseEstimator, TransformerMixin,
                     KMeansClusterer):
    """
    A sklearn-API compatible KMeans-clusterer using nltk's 
    implementation which enable use of cosine-distane metric
    """
    
    def __init__(self, k=5, distance=cosine_distance, repeats=3):
        """
        self.k: Number of clusers.
        self.distnace: Distance metric for clustering
        self.model: nltk-implementation of KMeans
        """
        self.k = k
        KMeansClusterer.__init__(self, num_means=self.k, distance=distance,
                                    avoid_empty_clusters=True)
        
    def fit(self, documents, labels=None):
        return self
    
    def transform(self, documents):
        """
        Fits the K-means model a vector repr. of documents.
        """
        return self.cluster(documents, assign_clusters=True)

In [3]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils import shuffle
import multiprocessing

df = pd.read_pickle('corpus/tweet_df.pkl')
df_user = pd.read_pickle('corpus/user_df.pkl')

cores = multiprocessing.cpu_count()
embedding = Doc2Vec(workers=cores)

tagged = [
    TaggedDocument(tweet.split(), tags=['tweet_'+str(idx), user, party])
    for idx, user, party, tweet in df[['user', 'party', 'text']].itertuples()
]


embedding.build_vocab(tagged)
embedding.train(shuffle(tagged), len(tagged), epochs=10)

X = np.zeros((len(df['user'].unique()), 100))
for ix, user in enumerate(df['user'].unique()):
    X[ix] = embedding.docvecs[user]

In [4]:
cluster = KMeansClusters()
df_user['cluster'] = cluster.fit_transform(X)

In [5]:
idx = 17
df.loc[17, 'text']

'RT @redhillfield: Thank you to everyone for celebrating our 35th anniversary in such a wonderful way. So proud to be part of our great lear…'

In [6]:
infer_vector = embedding.infer_vector(df.loc[17, 'text'])
embedding.docvecs.most_similar([infer_vector])

[('Andrea Leadsom', 0.979690432548523),
 ('Conservative', 0.97930908203125),
 ('Chloe Smith', 0.9792575836181641),
 ('tweet_159', 0.9791668653488159),
 ('Alan Mak', 0.9790177345275879),
 ('Boris Johnson', 0.9788397550582886),
 ('Chris Green', 0.9788180589675903),
 ('Alex Chalk', 0.9786611795425415),
 ('Andrew Jones', 0.9785363674163818),
 ('Caroline Spelman', 0.9784775972366333)]

In [7]:
infer_vector

array([ 0.0002142 ,  0.00394494, -0.02207178,  0.01018356, -0.01337042,
        0.00833286,  0.00536455, -0.01779371,  0.01657497,  0.0070423 ,
        0.00594908, -0.00219435,  0.00582279, -0.01242191, -0.02592236,
        0.00942667, -0.00652943, -0.01696626, -0.00352099, -0.01430876,
       -0.0009853 , -0.00749709, -0.01274483, -0.00310236, -0.01439537,
       -0.01136095,  0.02176287, -0.02524291, -0.01013945,  0.01562772,
        0.0117149 , -0.00767352, -0.00649277,  0.00061482,  0.01650634,
        0.01341147, -0.00130015, -0.01532868, -0.00329674, -0.01145836,
       -0.00908725, -0.00821133,  0.00424836,  0.01825471,  0.00896086,
       -0.00082403,  0.00214939, -0.02232845, -0.00493732,  0.00554375,
       -0.01812016,  0.01444871,  0.00540727,  0.00028708,  0.01148098,
        0.02407753, -0.00097761,  0.01915943,  0.01524527, -0.01255484,
       -0.0247018 ,  0.02284004, -0.02905793, -0.00795073, -0.0149596 ,
       -0.01912027, -0.02632896,  0.01299851, -0.00951121,  0.01

In [11]:
df_user['cluster']

user
Adam Afriyie            2
Alan Duncan             1
Alan Mak                3
Alberto Costa           1
Alec Shelbrooke         2
Alex Burghart           1
Alex Chalk              2
Alok Sharma             1
Alun Cairns             1
Amanda Milling          2
Amber Rudd              1
Andrea Jenkyns          2
Andrea Leadsom          1
Andrew Bowie            3
Andrew Bridgen          0
Andrew Jones            2
Andrew Lewer            4
Andrew Rosindell        2
Andrew Selous           1
Andrew Stephenson       1
Anne Marie Morris       4
Anne-Marie Trevelyan    1
Ben Bradley             2
Ben Wallace             2
Bernard Jenkin          1
Bim Afolami             1
Bob Blackman            2
Bob Seely               1
Boris Johnson           2
Brandon Lewis           3
Caroline Dinenage       0
Caroline Spelman        1
Cheryl Gillan           1
Chloe Smith             1
Chris Green             1
Chris Heaton-Harris     1
Chris Philp             2
Name: cluster, dtype: int64

In [22]:
concat = ' '.join(df_user.loc[df_user['cluster'] == cluster, 'text'])


In [23]:
concat

'Have been dealing with several issues around work visas (teacher &amp; an airport employee), parking problems/fines, access to education for someone’s young child with autism, plus inevitable concerns about noise from Heathrow ✈️ - we fight on! After a quiet evening at The Barley Mow at Englefield Green, the weather’s looking good today and I’m doing some casework with an active day across the constituency. @bmenglefield @WindsorTories Good news for the UK. As the PM’s Trade Envoy to Ghana &amp; Guinea, I’m very much aware that ship building can be a great export too! https://t.co/FJQZ0VYlmC The green belt helps make our constituency a beautiful place to live. So I’m delighted to say that today we’ve secured £50k from Government, for the Royal Borough of Windsor &amp; Maidenhead to clamp down on illegal building on the green belt-just great!  @rbwm @mhclg @WindsorTories 🚨 Attention Datchet! 🚨\n\nYour local @PostOffice is in dire need of a new postmaster!\n\nOur Post Offices provide a 

In [24]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=TweetTokenizer().tokenize,
                                   stop_words='english',
                                   ngram_range=(2, 3))

cluster_text = [' '.join(df_user.loc[df_user['cluster'] == cluster, 'text'])
 for cluster in df_user['cluster'].unique()]



NameError: name 'central_' is not defined

In [8]:
for text in df['text']:
    print(text)

Have been dealing with several issues around work visas (teacher &amp; an airport employee), parking problems/fines, access to education for someone’s young child with autism, plus inevitable concerns about noise from Heathrow ✈️ - we fight on!
After a quiet evening at The Barley Mow at Englefield Green, the weather’s looking good today and I’m doing some casework with an active day across the constituency. @bmenglefield @WindsorTories
Good news for the UK. As the PM’s Trade Envoy to Ghana &amp; Guinea, I’m very much aware that ship building can be a great export too! https://t.co/FJQZ0VYlmC
The green belt helps make our constituency a beautiful place to live. So I’m delighted to say that today we’ve secured £50k from Government, for the Royal Borough of Windsor &amp; Maidenhead to clamp down on illegal building on the green belt-just great!  @rbwm @mhclg @WindsorTories
🚨 Attention Datchet! 🚨

Your local @PostOffice is in dire need of a new postmaster!

Our Post Offices provide a vital