# General imports.

In [1]:
import pandas as pd, numpy as np
from langdetect import detect 

pd.options.display.max_colwidth = 200

# Data.

In [2]:
df = pd.read_csv('data/final_palantir_following.csv')
del df['Unnamed: 0']
df.head(2)

Unnamed: 0,screenName,bio
0,AirAsia,"Welcome to our official Twitter account where we tweet exclusive updates, contests & our low fares. For Customer Support, kindly tweet @AirAsiaSupport"
1,JamesCarville,Political strategist. Ragin’ Cajun. Father of two. Louisiana Dem. Gulf Coaster.


In [3]:
df.shape

(20, 2)

In [46]:
df

Unnamed: 0,screenName,bio,clean_text
0,AirAsia,"Welcome to our official Twitter account where we tweet exclusive updates, contests & our low fares. For Customer Support, kindly tweet @AirAsiaSupport",welcom offici twitter account tweet exclus updat contest low fare custom support kind tweet airasiasupport
1,JamesCarville,Political strategist. Ragin’ Cajun. Father of two. Louisiana Dem. Gulf Coaster.,polit strategist ragin cajun father two louisiana dem gulf coaster
2,Ferrari,"Official account of #Ferrari, \nItalian Excellence that makes the world dream.",offici account ferrari italian excel make world dream
3,AUSTRAC,"Official account for AUSTRAC, Australia's financial intelligence agency. For more info or to contact us, please visit our website.",offici account austrac australia s financi intellig agenc info contact us pleas visit websit
4,fcagroup,Welcome to the official Fiat Chrysler Automobiles account. #fiatchrysler #fcagroup,welcom offici fiat chrysler automobil account fiatchrysl fcagroup
5,CreditSuisse,"Credit Suisse is a leading wealth manager, with strong investment banking capabilities. Also follow us on @csapac and @csschweiz",credit suiss lead wealth manag strong invest bank capabl also follow us csapac csschweiz
6,merckgroup,"A leading science and technology company in healthcare, life science, performance materials. Not intended for US&CA visitors. https://t.co/wh24XCVhmE",lead scienc technolog compani healthcar life scienc perform materi intend us ca visitor https t co wh xcvhme
7,Airbus,Live updates from the people that build the world's best planes and pioneer the future of aerospace. #WeMakeItFly ✈️,live updat peopl build world s best plane pioneer futur aerospac wemakeitfli
8,futureofprivacy,"A catalyst for #privacy leadership & scholarship, advancing principled data practices in support of emerging #tech. Get updates: https://t.co/P9PvpEAnfJ",catalyst privaci leadership scholarship advanc principl data practic support emerg tech get updat https t co p pvpeanfj
9,JulesPolonetsky,"CEO @futureofprivacy, Co-Chair @ILTechPolicy, won't speak on all male panels.",ceo futureofprivaci co chair iltechpolici won t speak male panel


# Features.

In [4]:
from text_processing import cleaning_string, cleaning_data
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
df2 = cleaning_data(df, 'bio')
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, use_idf=True)
df_sparse = tfidf_vectorizer.fit_transform(df2.clean_text.tolist())

# Models.

In [6]:
from sklearn.cluster import KMeans

In [7]:
k = 4
km_model = KMeans(n_clusters=k, n_jobs=-1)
km_model.fit(df_sparse)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [8]:
df2['labels'] = km_model.labels_

# Assessment

In [9]:
df2.groupby('labels').count()

Unnamed: 0_level_0,screenName,bio,clean_text
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,5,5
1,2,2,2
2,8,8,8
3,5,5,5


# Get the centroid

In [10]:
from sklearn.metrics import pairwise_distances_argmin_min

In [30]:
cluster_number = 3
df_sub0 = df2[df2['labels'] == cluster_number]
df_sparse0 = tfidf_vectorizer.transform(df_sub0.clean_text.tolist())

closest, _ = pairwise_distances_argmin_min(km_model.cluster_centers_[cluster_number].reshape(1, -1), df_sparse0)

df_sub0.iloc[closest]

Unnamed: 0,screenName,bio,clean_text,labels
12,kncukier,"The Economist's Senior Editor. Coauthor of bestseller ""Big Data"". Board director at https://t.co/gE85O4kDEU and https://t.co/uSSaPIzEH0 Fellow at @OxfordSBS",economist s senior editor coauthor bestsel big data board director https t co ge o kdeu https t co ussapizeh fellow oxfordsb,3


In [31]:
df_sub0

Unnamed: 0,screenName,bio,clean_text,labels
8,futureofprivacy,"A catalyst for #privacy leadership & scholarship, advancing principled data practices in support of emerging #tech. Get updates: https://t.co/P9PvpEAnfJ",catalyst privaci leadership scholarship advanc principl data practic support emerg tech get updat https t co p pvpeanfj,3
9,JulesPolonetsky,"CEO @futureofprivacy, Co-Chair @ILTechPolicy, won't speak on all male panels.",ceo futureofprivaci co chair iltechpolici won t speak male panel,3
12,kncukier,"The Economist's Senior Editor. Coauthor of bestseller ""Big Data"". Board director at https://t.co/gE85O4kDEU and https://t.co/uSSaPIzEH0 Fellow at @OxfordSBS",economist s senior editor coauthor bestsel big data board director https t co ge o kdeu https t co ussapizeh fellow oxfordsb,3
13,valleyhack,Author of best seller on Elon Musk - https://t.co/ECPVUpFp90. Writer for Businessweek. Host of Hello World. https://t.co/5cDJBYlj0Z Former scribe for NYT.,author best seller elon musk https t co ecpvupfp writer businessweek host hello world https t co cdjbylj z former scribe nyt,3
16,acroll,Thinking about the intersection of tech & society. Once Extremely Online. @Strataconf @Startupfest https://t.co/nvU6C9c7Il @leananalytics @fwd50conf @scaletechconf,think intersect tech societi extrem onlin strataconf startupfest https t co nvu c c il leananalyt fwd conf scaletechconf,3


# Looking at the distances across data points

In [61]:
df_tfidf = pd.DataFrame(df_sparse.toarray())

In [62]:
a = df_tfidf.iloc[7]
b = df_tfidf.iloc[0]

dist = np.linalg.norm(a-b)
print(dist)

1.2860492194534463


In [63]:
a = df_tfidf.iloc[7]
b = df_tfidf.iloc[5]

dist = np.linalg.norm(a-b)
print(dist)

1.4142135623730951


In [65]:
abs(1.2860492194534463 - 1.4142135623730951)/1.414213562373095

0.09062587598480179