# Twitterでフォロワーをクラスタリングしてみた。

## 目次
---
- フォロワー同士の関係を取得
- クラスタリング
- 属性分析

In [1]:
import re
import os, sys
sys.path.append('../')

import psycopg2
import collections
from tqdm import tqdm

import MeCab
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

## フォロワー同士の関係を取得
---

In [2]:
# 接続情報
connection_config = {
    'host': os.environ['RDS_HOST'],
    'port': '5432',
    'database': os.environ['RDS_DATABASE'],
    'user': os.environ['RDS_USERNAME'],
    'password': os.environ['RDS_PASSWORD']
}

conn = psycopg2.connect(**connection_config)

In [3]:
q_screen_name = 'rui_308'
q = "SELECT * FROM for_research.user_info where slave_screen_name='%s'" % q_screen_name

follower_df = pd.read_sql(sql=q, con=conn)

In [4]:
follower_df.head()

Unnamed: 0,slave_screen_name,user_id,screen_name,friends_count,followers_count,self_description,protected
0,rui_308,1027812767661678592,chelios406,35,19,シェブ チェリオスです。充電しないと即停止します。面白いツイートはできません。基本、夜しか出...,0
1,rui_308,473583122,hasamarhythm,1080,794,みんなもレッツ諦念,0
2,rui_308,742897072228044800,ke_mirakao8,102,22,大学生。クリープハイプ/andymori/ナードマグネット/teto/リーガルリリー/Hel...,0
3,rui_308,3184817456,JSUBC,1194,1201,J-subculture,0
4,rui_308,3220618783,Luisthebestcat,81,44,I love it all japanese music like indigo la En...,0


In [5]:
follower_ids = follower_df['user_id']

In [6]:
q = "select * from for_research.friend_ids"

friends_df = pd.read_sql(sql=q, con=conn)

In [7]:
friends_df['com_friend_ids'] = friends_df['friend_ids'].str.split(',') \
    .apply(lambda x: np.array(x).astype(int)) \
    .apply(lambda x: list(set(x) & set(follower_ids)))

In [8]:
friends_df.head()

Unnamed: 0,user_id,screen_name,friend_ids,com_friend_ids
0,762311395354435584,ace_0815,"242298498,708952352288481280,84017558035607961...","[947833242278694912, 242298498, 85504930587848..."
1,942665865232760832,utsusemi_end,"559501103,480876175,912664232881094656,3145665...","[754466790403518464, 948638887, 3221413352, 23..."
2,804375248355213313,p_z4s3,"3308626946,2962184605,3145665278,9575947655030...","[933697292909010944, 941983908576370688, 82123..."
3,973092802925010945,fugutaruto0922,"1015164230041915394,1027199962168188930,953892...","[1032537382073982976, 868816424667455489, 1015..."
4,943103693464903681,1O_ooll,"986929396085747713,845294005210378241,85895520...","[788274867141349376, 924582695358697472, 81913..."


In [9]:
friends_df.shape

(395, 4)

In [10]:
friends_df.dtypes

user_id            int64
screen_name       object
friend_ids        object
com_friend_ids    object
dtype: object

In [11]:
print ('組み合せ総数：', friends_df['com_friend_ids'].apply(lambda x: len(x)).sum()/2)

組み合せ総数： 5474.5


## クラスタリング
---
1. `user_id`と`screen_name`が対応する辞書を作る
- `friends_df['com_friend_screen_name']`の定義
- グラフを定義
    - `q_screen_name`と対象フォロワーを繋げる
    - 対象フォロワー同士のエッジを追加

### user_idとscreen_nameが対応する辞書を作る
friends_df['com_friend_ids']に対応して、friends_df['com_friend_screen_name']を定義

In [12]:
dict_id_to_name = follower_df[['user_id', 'screen_name']].set_index('user_id').T.to_dict()

In [13]:
# screen_nameの呼び出し方法
dict_id_to_name[761867125589807104]['screen_name']

'ra_0y'

### friends_df['com_friend_screen_name']の定義

In [14]:
friends_df['com_friend_screen_name'] = friends_df['com_friend_ids'].apply(lambda x: [dict_id_to_name[i]['screen_name'] for i in x])

In [15]:
friends_df.head()

Unnamed: 0,user_id,screen_name,friend_ids,com_friend_ids,com_friend_screen_name
0,762311395354435584,ace_0815,"242298498,708952352288481280,84017558035607961...","[947833242278694912, 242298498, 85504930587848...","[aiueokrock, GOLDEN_PIGS, RyuyaK1208, aoboshi_s]"
1,942665865232760832,utsusemi_end,"559501103,480876175,912664232881094656,3145665...","[754466790403518464, 948638887, 3221413352, 23...","[yutokk00, shuhei_t_, arca__info, maropoti09, ..."
2,804375248355213313,p_z4s3,"3308626946,2962184605,3145665278,9575947655030...","[933697292909010944, 941983908576370688, 82123...","[sumika_______04, flumpool846, rock_RAD3612, s..."
3,973092802925010945,fugutaruto0922,"1015164230041915394,1027199962168188930,953892...","[1032537382073982976, 868816424667455489, 1015...","[K13ODA9hJR8qPJe, ALXD_Kick_Spin, FullspecHLD,..."
4,943103693464903681,1O_ooll,"986929396085747713,845294005210378241,85895520...","[788274867141349376, 924582695358697472, 81913...","[SUPERROCKBAND, toshiki_drum163, toumorocoshi4..."


### グラフを定義
1. `q_screen_name`と対象フォロワーを繋げる
- 対象フォロワー同士のエッジを追加

In [16]:
# グラフの定義
G_follower = nx.Graph()
targets = friends_df[['screen_name', 'com_friend_screen_name']]

for t, com_names in targets.values:
    for com_name in com_names:
        G_follower.add_edge(t, com_name)

In [17]:
cliques = nx.node_clique_number(G_follower)

In [18]:
cluster_dic = {}

for screen_name, cluster in cliques.items():
    if not cluster in cluster_dic.keys():
        cluster_dic[cluster] = list()
    cluster_dic[cluster].append(screen_name)
    

## 属性分析
----

### 言語処理
- プロフィールから名詞のみを抽出
    - self_discription -> nounsに変換

In [133]:
def get_nouns(sentence):
    mt = MeCab.Tagger(' -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    mt.parse('')
    node = mt.parseToNode(sentence)

    nouns = list()
    while node:
        if node.feature.split(',')[0]=='名詞':
            nouns.append(node.surface)
        node = node.next

    dst = ','.join(nouns)
    return dst

#半角小文字
def islower(s):
    lowerReg = re.compile(r'^[a-z]+$')
    return lowerReg.match(s) is not None

#半角数字
def isdigit(s):
    digitReg = re.compile(r'^[0-9]+$')
    return digitReg.match(s) is not None

In [40]:
cluster_dic.keys()

dict_keys([2, 3, 4, 5, 7, 8, 6])

In [136]:
cluster_df = follower_df.copy()
cluster_df['nouns'] = cluster_df['self_description'].apply(lambda x: get_nouns(x))
cluster_df['class'] = None

In [147]:
# cluster_dfにクラスを割り当て
for c, cluster_users in cluster_dic.items():
    cluster_df.loc[cluster_df.screen_name.isin(cluster_users), 'class'] = c

In [148]:
cluster_df['self_description'][2]

'大学生。クリープハイプ/andymori/ナードマグネット/teto/リーガルリリー/Helsinki Lambda Club/My Hair is Bad/植田真梨恵/ヨルシカ 等色々聴きます。モンスト(ランク280運極40)と刀剣乱舞(2年目)ものんびりやってます。無言フォロー失礼します。'

In [156]:
cluster_df['class'].isnull().sum(), cluster_df.shape[0]

(2792, 5361)

In [157]:
cluster_df['class'].value_counts()

2    1710
3     608
4     150
8      31
5      31
6      25
7      14
Name: class, dtype: int64

In [171]:
# 各クラスのワードを取得
cluster_word_dic = {}

for cluster in cluster_dic.keys():
    if not cluster in cluster_word_dic.keys():
        cluster_word_dic[cluster] = list()
    
    t_cluster_df = cluster_df[cluster_df['class']==cluster]
    all_description = ','.join(t_cluster_df.nouns.tolist())
    all_description_list = all_description.split(',')

    c = collections.Counter(all_description_list)

    cluster_words = c.most_common()

    for word, cnt in tqdm(c.most_common()):
        if isdigit(word) or islower(word) or word.strip()=='':
            cluster_words.remove((word, cnt))
    
    cluster_word_dic[cluster] = dict(cluster_words)

100%|██████████| 7801/7801 [00:00<00:00, 30160.55it/s]
100%|██████████| 3863/3863 [00:00<00:00, 58741.03it/s]
100%|██████████| 1163/1163 [00:00<00:00, 139898.35it/s]
100%|██████████| 313/313 [00:00<00:00, 111672.10it/s]
100%|██████████| 145/145 [00:00<00:00, 93421.52it/s]
100%|██████████| 333/333 [00:00<00:00, 135195.36it/s]
100%|██████████| 177/177 [00:00<00:00, 57787.17it/s]


In [200]:
data = []
for cluster in cluster_dic.keys():
    data.append(list(cluster_word_dic[cluster].keys())[:20])

In [204]:
# 全クラスのワードを取得
t_cluster_df = cluster_df
all_description = ','.join(t_cluster_df.nouns.tolist())
all_description_list = all_description.split(',')

c = collections.Counter(all_description_list)

cluster_words = c.most_common()

for word, cnt in tqdm(c.most_common()):
    if isdigit(word) or islower(word) or word.strip()=='':
        cluster_words.remove((word, cnt))

100%|██████████| 19418/19418 [00:01<00:00, 10999.84it/s]


In [214]:
cluster_word_result = pd.DataFrame(np.array(data).T, columns=cluster_dic.keys())
cluster_word_result['all'] = np.array(cluster_words)[:20, 0]

In [215]:
cluster_word_result

Unnamed: 0,2,3,4,5,7,8,6,all
0,ロック,邦,ロック,FMS,研,研,研,好き
1,邦,ロック,邦,研,FMS,FMS,FMS,ロック
2,好き,好き,好き,☺,垢,B4,M,邦
3,フォロー,フォロー,垢,KEYTALK,フォロー,明治大学,Meiji,フォロー
4,無言,無言,フォロー,同盟,NCC,垢,B4,音楽
5,垢,垢,無言,➡,B4,宮下,明治,垢
6,音楽,大好き,KEYTALK,明治,二,コン,数理,無言
7,バンド,フォロバ,ラジ友,フォロバ,明治,班,4年,さん
8,ライブ,音楽,オーラル,邦,M,TOEIC,Fes,バンド
9,さん,参戦,キュウソ,ロック,映像,Web,音楽,大好き


In [95]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# vec = TfidfVectorizer(max_df=10)
# all_description = ','.join(cluster_df.self_description.tolist())
# all_description_list = all_description.split(',')
# term_doc = vec.fit_transform(all_description_list)

# vec.vocabulary_

## 可視化

In [73]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [32]:
_all_description_list = all_description_list

for word in tqdm(_all_description_list):
    if isdigit(word) or islower(word) or word.strip()=='':
        all_description_list.remove(word)

 98%|█████████▊| 1765/1807 [00:00<00:00, 208410.42it/s]


In [33]:
fpath = "~/Library/Fonts/ヒラギノ丸ゴ ProN W4.ttc"
temp = ' '.join(all_description_list)
wordcloud = WordCloud(background_color="white", font_path=fpath, width=1200, height=800, ranks_only=True).generate(temp)

In [37]:
# plt.figure(figsize=(12,8))
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show()