In [2]:
import pandas as pd, numpy as np
from langdetect import detect 
from text_token import column_token
from gensim.models import word2vec

pd.options.display.max_colwidth = 200

In [3]:
df = pd.read_csv('data/final_palantir_following.csv')
del df['Unnamed: 0']
df.head(2)

Unnamed: 0,screenName,bio
0,AirAsia,"Welcome to our official Twitter account where we tweet exclusive updates, contests & our low fares. For Customer Support, kindly tweet @AirAsiaSupport"
1,JamesCarville,Political strategist. Ragin’ Cajun. Father of two. Louisiana Dem. Gulf Coaster.


In [4]:
df2 = column_token(df, 'bio')
df2.head()

Unnamed: 0,screenName,bio,clean_text
0,AirAsia,"Welcome to our official Twitter account where we tweet exclusive updates, contests & our low fares. For Customer Support, kindly tweet @AirAsiaSupport","[welcome, official, twitter, account, tweet, exclusive, updates, contests, low, fares, customer, support, kindly, tweet, airasiasupport]"
1,JamesCarville,Political strategist. Ragin’ Cajun. Father of two. Louisiana Dem. Gulf Coaster.,"[political, strategist, ragin, cajun, father, two, louisiana, dem, gulf, coaster]"
2,Ferrari,"Official account of #Ferrari, \nItalian Excellence that makes the world dream.","[official, account, ferrari, italian, excellence, makes, world, dream]"
3,AUSTRAC,"Official account for AUSTRAC, Australia's financial intelligence agency. For more info or to contact us, please visit our website.","[official, account, austrac, australia, s, financial, intelligence, agency, info, contact, us, please, visit, website]"
4,fcagroup,Welcome to the official Fiat Chrysler Automobiles account. #fiatchrysler #fcagroup,"[welcome, official, fiat, chrysler, automobiles, account, fiatchrysler, fcagroup]"


In [5]:
from doc_embed import averaged_word_vectorizer

In [6]:
# Set values for various parameters
feature_size = 10    # Word vector dimensionality  
window_context = 10          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(df2['clean_text'], size=feature_size, 
                              window=window_context, min_count = min_word_count,
                              sample=sample, iter=100)

In [7]:
w2v_feature_array = averaged_word_vectorizer(corpus=df2['clean_text'], model=w2v_model,
                                             num_features=feature_size)
#pd.DataFrame(w2v_feature_array)

In [8]:
from sklearn.cluster import KMeans

In [9]:
k = 4
km_model = KMeans(n_clusters=k, n_jobs=-1)
km_model.fit(pd.DataFrame(w2v_feature_array))

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [10]:
df2['labels'] = km_model.labels_

In [11]:
df2

Unnamed: 0,screenName,bio,clean_text,labels
0,AirAsia,"Welcome to our official Twitter account where we tweet exclusive updates, contests & our low fares. For Customer Support, kindly tweet @AirAsiaSupport","[welcome, official, twitter, account, tweet, exclusive, updates, contests, low, fares, customer, support, kindly, tweet, airasiasupport]",1
1,JamesCarville,Political strategist. Ragin’ Cajun. Father of two. Louisiana Dem. Gulf Coaster.,"[political, strategist, ragin, cajun, father, two, louisiana, dem, gulf, coaster]",2
2,Ferrari,"Official account of #Ferrari, \nItalian Excellence that makes the world dream.","[official, account, ferrari, italian, excellence, makes, world, dream]",2
3,AUSTRAC,"Official account for AUSTRAC, Australia's financial intelligence agency. For more info or to contact us, please visit our website.","[official, account, austrac, australia, s, financial, intelligence, agency, info, contact, us, please, visit, website]",1
4,fcagroup,Welcome to the official Fiat Chrysler Automobiles account. #fiatchrysler #fcagroup,"[welcome, official, fiat, chrysler, automobiles, account, fiatchrysler, fcagroup]",2
5,CreditSuisse,"Credit Suisse is a leading wealth manager, with strong investment banking capabilities. Also follow us on @csapac and @csschweiz","[credit, suisse, leading, wealth, manager, strong, investment, banking, capabilities, also, follow, us, csapac, csschweiz]",1
6,merckgroup,"A leading science and technology company in healthcare, life science, performance materials. Not intended for US&CA visitors. https://t.co/wh24XCVhmE","[leading, science, technology, company, healthcare, life, science, performance, materials, intended, us, ca, visitors, https, t, co, wh, xcvhme]",0
7,Airbus,Live updates from the people that build the world's best planes and pioneer the future of aerospace. #WeMakeItFly ✈️,"[live, updates, people, build, world, s, best, planes, pioneer, future, aerospace, wemakeitfly]",1
8,futureofprivacy,"A catalyst for #privacy leadership & scholarship, advancing principled data practices in support of emerging #tech. Get updates: https://t.co/P9PvpEAnfJ","[catalyst, privacy, leadership, scholarship, advancing, principled, data, practices, support, emerging, tech, get, updates, https, t, co, p, pvpeanfj]",0
9,JulesPolonetsky,"CEO @futureofprivacy, Co-Chair @ILTechPolicy, won't speak on all male panels.","[ceo, futureofprivacy, co, chair, iltechpolicy, won, t, speak, male, panels]",1


In [19]:
df2.groupby('labels').count()

Unnamed: 0_level_0,screenName,bio,clean_text
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,5,5
1,4,4,4
2,1,1,1
3,10,10,10


# Diagnosis

In [20]:
from sklearn.metrics import pairwise_distances_argmin_min

In [46]:
cluster_number = 3

df_sub0 = df2[df2['labels'] == cluster_number]
w2v_feature_array0 = averaged_word_vectorizer(corpus=df_sub0['clean_text'], model=w2v_model,
                                             num_features=feature_size)
df_sparse0 = pd.DataFrame(w2v_feature_array0)

closest, _ = pairwise_distances_argmin_min(km_model.cluster_centers_[cluster_number].reshape(1, -1), df_sparse0)

df_sub0.iloc[closest]

Unnamed: 0,screenName,bio,clean_text,labels
3,AUSTRAC,"Official account for AUSTRAC, Australia's financial intelligence agency. For more info or to contact us, please visit our website.","[official, account, austrac, australia, s, financial, intelligence, agency, info, contact, us, please, visit, website]",3


In [47]:
df_sub0

Unnamed: 0,screenName,bio,clean_text,labels
0,AirAsia,"Welcome to our official Twitter account where we tweet exclusive updates, contests & our low fares. For Customer Support, kindly tweet @AirAsiaSupport","[welcome, official, twitter, account, tweet, exclusive, updates, contests, low, fares, customer, support, kindly, tweet, airasiasupport]",3
3,AUSTRAC,"Official account for AUSTRAC, Australia's financial intelligence agency. For more info or to contact us, please visit our website.","[official, account, austrac, australia, s, financial, intelligence, agency, info, contact, us, please, visit, website]",3
5,CreditSuisse,"Credit Suisse is a leading wealth manager, with strong investment banking capabilities. Also follow us on @csapac and @csschweiz","[credit, suisse, leading, wealth, manager, strong, investment, banking, capabilities, also, follow, us, csapac, csschweiz]",3
7,Airbus,Live updates from the people that build the world's best planes and pioneer the future of aerospace. #WeMakeItFly ✈️,"[live, updates, people, build, world, s, best, planes, pioneer, future, aerospace, wemakeitfly]",3
9,JulesPolonetsky,"CEO @futureofprivacy, Co-Chair @ILTechPolicy, won't speak on all male panels.","[ceo, futureofprivacy, co, chair, iltechpolicy, won, t, speak, male, panels]",3
10,CenDemTech,"The Center for Democracy & Technology. Shaping tech policy & architecture, with a focus on the rights of the individual. @CDTEU for our EU-based team.","[center, democracy, technology, shaping, tech, policy, architecture, focus, rights, individual, cdteu, eu, based, team]",3
11,DirectRelief,"Mission: Improve the health and lives of people affected by poverty or emergencies without regard to politics, religion, or ability to pay.","[mission, improve, health, lives, people, affected, poverty, emergencies, without, regard, politics, religion, ability, pay]",3
15,radar,O'Reilly Media's group blog about emerging technologies. Twitter feed managed by @JennWebb.,"[o, reilly, media, s, group, blog, emerging, technologies, twitter, feed, managed, jennwebb]",3
17,thejointstaff,"The official account for news on the #JointForce, #GenDunford, #GenSelva and Joint Staff services. (Following, RTs and links ≠ endorsement by @DeptofDefense)","[official, account, news, jointforce, gendunford, genselva, joint, staff, services, following, rts, links, endorsement, deptofdefense]",3
18,TeamRubicon,Team Rubicon unites the skills and experiences of military veterans with first responders to rapidly deploy emergency response teams.,"[team, rubicon, unites, skills, experiences, military, veterans, first, responders, rapidly, deploy, emergency, response, teams]",3


# Exploration

In [50]:
df_word2vec = pd.DataFrame(w2v_feature_array)

In [56]:
a = df_word2vec.iloc[7] # airbus
b = df_word2vec.iloc[0]  # air asia 

dist = np.linalg.norm(a-b)
print(dist)

0.08669840375984277


In [60]:
a = df_word2vec.iloc[7]  # airbus
b = df_word2vec.iloc[5]  # credit suisse 

dist = np.linalg.norm(a-b)
print(dist)

0.09752927886183509


In [61]:
abs(0.08669840375984277-0.09752927886183509)/0.09752927886183509

0.11105254984337459