In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Number of LDA topics; passed as `num_topics` to every get_df_with_cluster call below.
num_lda_topics = 8

In [35]:
# Load the preprocessed data; the cells below read its 'twitter user id', 'class' and 'texts' columns.
df = pd.read_csv('../data/preprocessed_data.csv')

In [36]:
# Distinct twitter user ids present in df (no grouping happens here).
user_ids = df['twitter user id'].unique()

In [37]:
# One row per user with that user's class label, in order of first appearance in df.
# drop_duplicates keeps each user's first row, so the label is the class found on
# that first row; this is a single pass instead of re-filtering df once per user.
df_user_ids = (
    df.drop_duplicates(subset='twitter user id', keep='first')
    [['twitter user id', 'class']]
    .reset_index(drop=True)
)
df_user_ids.shape


(151, 2)

In [38]:
# Split the user table into one frame per class label (0..4 follow the label order below).
df_user_ids_0, df_user_ids_1, df_user_ids_2, df_user_ids_3, df_user_ids_4 = (
    df_user_ids[df_user_ids['class'] == class_label]
    for class_label in ('no influencer', 'nano', 'micro', 'macro', 'mega')
)

In [39]:
# Class 0 users: 80% are sampled into train; half of the remaining users are
# sampled into validate, and whatever is left after both draws becomes test.
df_user_ids_0_train = df_user_ids_0.sample(random_state=0, frac=0.8)
df_user_ids_0_validate = (
    df_user_ids_0
    .drop(index=df_user_ids_0_train.index)
    .sample(random_state=0, frac=0.5)
)
df_user_ids_0_test = df_user_ids_0.drop(
    index=df_user_ids_0_train.index.union(df_user_ids_0_validate.index)
)

In [40]:
def _split_user_ids(class_users, train_frac=0.8, validate_frac=0.5, seed=0):
    """Split one class's user frame into (train, test, validate) parts.

    `train_frac` of the rows are sampled into train; `validate_frac` of the
    remaining rows are sampled into validate; the rest is test. Both draws use
    `seed` as `random_state`, so the split is repeatable for a given input.
    """
    train = class_users.sample(frac=train_frac, random_state=seed)
    holdout = class_users.drop(train.index)
    validate = holdout.sample(frac=validate_frac, random_state=seed)
    test = holdout.drop(validate.index)
    return train, test, validate


# Apply the same per-class split to classes 1-4.
df_user_ids_1_train, df_user_ids_1_test, df_user_ids_1_validate = _split_user_ids(df_user_ids_1)
df_user_ids_2_train, df_user_ids_2_test, df_user_ids_2_validate = _split_user_ids(df_user_ids_2)
df_user_ids_3_train, df_user_ids_3_test, df_user_ids_3_validate = _split_user_ids(df_user_ids_3)
df_user_ids_4_train, df_user_ids_4_test, df_user_ids_4_validate = _split_user_ids(df_user_ids_4)

In [41]:
# Stack the per-class parts so each split (train / test / validate) contains users of every class.
df_user_ids_train = pd.concat([
    df_user_ids_0_train,
    df_user_ids_1_train,
    df_user_ids_2_train,
    df_user_ids_3_train,
    df_user_ids_4_train,
])
df_user_ids_test = pd.concat([
    df_user_ids_0_test,
    df_user_ids_1_test,
    df_user_ids_2_test,
    df_user_ids_3_test,
    df_user_ids_4_test,
])
df_user_ids_validate = pd.concat([
    df_user_ids_0_validate,
    df_user_ids_1_validate,
    df_user_ids_2_validate,
    df_user_ids_3_validate,
    df_user_ids_4_validate,
])

In [42]:
# Sanity check: number of users in the train, test and validate splits.
tuple(
    user_split.shape
    for user_split in (df_user_ids_train, df_user_ids_test, df_user_ids_validate)
)

((121, 2), (15, 2), (15, 2))

In [46]:
# Row-level splits: keep every row of df whose user id belongs to the matching user split.
df_train, df_test, df_validate = (
    df[df['twitter user id'].isin(user_split['twitter user id'])]
    for user_split in (df_user_ids_train, df_user_ids_test, df_user_ids_validate)
)

In [47]:
# Sanity check: number of rows in the train, test and validate splits.
tuple(row_split.shape for row_split in (df_train, df_test, df_validate))

((649, 4), (66, 4), (76, 4))

In [49]:
# Persist the three row-level splits next to the input file; index=False keeps the
# DataFrame index out of the CSVs.
df_train.to_csv('../data/train.csv', index=False)
df_test.to_csv('../data/test.csv', index=False)
df_validate.to_csv('../data/validate.csv', index=False)

In [23]:
# Rows of the full dataset (not just one split), separated by class label.
df_nano, df_macro, df_mega, df_no_influencer, df_micro = (
    df[df['class'] == class_label]
    for class_label in ('nano', 'macro', 'mega', 'no influencer', 'micro')
)

print(df_nano.shape, df_macro.shape, df_mega.shape, df_no_influencer.shape, df_micro.shape)

(158, 4) (177, 4) (213, 4) (74, 4) (169, 4)


In [24]:
# Collapse each class frame to one row per user: all of a user's texts are
# concatenated into a single space-separated string, indexed by 'twitter user id'.
df_nano_grp, df_macro_grp, df_micro_grp, df_mega_grp, df_no_influencer_grp = (
    class_rows.groupby('twitter user id').agg({'texts': ' '.join})
    for class_rows in (df_nano, df_macro, df_micro, df_mega, df_no_influencer)
)

In [25]:
# Character count of each user's concatenated text, stored as a new column on each grouped frame.
df_nano_grp['text_length'] = df_nano_grp['texts'].apply(len)
df_macro_grp['text_length'] = df_macro_grp['texts'].apply(len)
df_micro_grp['text_length'] = df_micro_grp['texts'].apply(len)
df_mega_grp['text_length'] = df_mega_grp['texts'].apply(len)
df_no_influencer_grp['text_length'] = df_no_influencer_grp['texts'].apply(len)

In [8]:
def get_df_with_cluster(df, num_topics, n_clusters=5, random_state=42):
    """Label every row of ``df`` with a KMeans cluster derived from LDA topic weights.

    Steps: term counts of ``df['texts']`` -> LDA document-topic weights ->
    standardised weights -> KMeans cluster label per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'texts'`` column with one document per row.
        NOTE: the frame is modified in place; a ``'cluster'`` column is added
        (or overwritten).
    num_topics : int
        Number of LDA components.
    n_clusters : int, default 5
        Number of KMeans clusters.
    random_state : int, default 42
        Seed used for both LDA and KMeans so repeated runs give the same labels.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df``, now carrying ``'cluster'``.
    """
    count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
    term_counts = count_vectorizer.fit_transform(df['texts'])

    lda = LatentDirichletAllocation(
        n_components=num_topics,
        max_iter=25,
        learning_method='batch',
        random_state=random_state,
        # evaluate_every is documented as an integer; 1 evaluates perplexity on
        # every iteration, which is what the previous non-integer 0.5 amounted to.
        evaluate_every=1,
    )
    doc_topic_weights = lda.fit_transform(term_counts)

    # Standardise each topic dimension so no single topic dominates the distances.
    scaled_weights = StandardScaler().fit_transform(doc_topic_weights)

    # Seeded so cluster ids are reproducible; n_init is pinned so the number of
    # restarts does not depend on the installed scikit-learn's default.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    df['cluster'] = kmeans.fit_predict(scaled_weights)
    return df

In [9]:
# Cluster the nano users' concatenated texts. get_df_with_cluster writes the 'cluster'
# column onto df_nano_grp itself and returns that same frame.
df_nano_return = get_df_with_cluster(df_nano_grp, num_lda_topics)

In [14]:
# NOTE(review): the saved output of this cell has a (cluster, twitter user id) index, which
# get_df_with_cluster alone does not create; presumably the group-and-sort cell had been run
# before this one. Confirm with Restart & Run All.
df_nano_return

Unnamed: 0_level_0,Unnamed: 1_level_0,texts,text_length,cluster
cluster,twitter user id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,aaa1584ceaad35c0a52a42803965c189,rt braver crypto bear market giveaway back fir...,986,0
0,aeb262e188dfd453e16d7b5909874968,rt wl giveaway prizes 2 x allowlist spots ente...,804,0
0,5bb03559fe50a06839c1a7aef1c94ceb,rt going make time special banksy promise tsla...,168,0
0,f73ecbc0d4d8fc84718f80a3e2cb7d59,rt ready lion flying_saucer mint september eth...,91,0
1,f77549dbdcbf38126210e5d5a9db98eb,rt todays ga celebrate women ens amazing see g...,789,1
1,ef3ebcf58e6468ea3c2fb3ad79540ad4,rt receive 86 000 000 86 000 000 shib first 30...,775,1
1,3e506c96800b45814613cc46a241385a,rt one first gacc got 2 months ago stoned dmt ...,739,1
1,9874e41d54a664d5bb5ac76346903ac7,rt iota launched smart contract beta makes eth...,652,1
1,f8404b995e68fac31ac3f8318884a0a9,rt excited announce last round 1inch airdrop s...,88,1
1,e3b4e60374658a005fa49fdbad1d516c,rt asia continues board 1inch plugs klaytn ent...,53,1


In [None]:
# Within every cluster, order the rows from longest to shortest text_length.
df_nano_return = df_nano_return.groupby('cluster').apply(
    lambda cluster_rows: cluster_rows.sort_values(by='text_length', ascending=False)
)

In [15]:
# Cluster the macro users' concatenated texts. get_df_with_cluster writes the 'cluster'
# column onto df_macro_grp itself and returns that same frame.
df_macro_return = get_df_with_cluster(df_macro_grp, num_lda_topics)

In [16]:
# Within every cluster, order the rows from longest to shortest text_length.
df_macro_return = df_macro_return.groupby('cluster').apply(
    lambda cluster_rows: cluster_rows.sort_values(by='text_length', ascending=False)
)

In [17]:
# Show the macro users grouped by cluster, longest text first within each cluster.
df_macro_return

Unnamed: 0_level_0,Unnamed: 1_level_0,texts,text_length,cluster
cluster,twitter user id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,383e21ef21cf28f9f3005c9fc6903109,peak bull like top 50 coins possible disrupt a...,1801,0
0,d86fbb678a84112f8914c63e46531d21,locking period double rewards low transactions...,1253,0
0,eb609388df96de828143f2830c89251a,anticipated earnings july 25th july 29th check...,906,0
0,c0d3cb706dd6ad09cb7e0b7858f98e63,good morning cole everyone happy labor day wee...,671,0
0,5be065e13b80dad05212f137e83deb38,winner trophy congratulations party_popper con...,657,0
0,991870dafafeceb067d2ef608daed64b,rt hon butiime order give force compensation s...,58,0
1,aec6c5d7bde4579b1af1e6f3dd067c57,rt bitmart monthly report december 2021 dizzy ...,1380,1
1,84993708d20e2dffb5807b6b510624f0,anatol lieven anyone expresses admiration anym...,1193,1
1,96e9d592dfc66654b653cb2822be1760,tiktok growing platform new creators get invol...,936,1
2,a6623aea011de0796dab71f92c921e7d,huge divergence small caps iwm qqq think fed m...,815,2


In [18]:
# Cluster the micro users, then order each cluster from longest to shortest text_length.
df_micro_return = (
    get_df_with_cluster(df_micro_grp, num_lda_topics)
    .groupby('cluster')
    .apply(lambda cluster_rows: cluster_rows.sort_values(by='text_length', ascending=False))
)
df_micro_return

Unnamed: 0_level_0,Unnamed: 1_level_0,texts,text_length,cluster
cluster,twitter user id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,787ad6a075042caeb746621efdbdcd85,halo web3 free play today want earn sol playin...,825,0
0,f3b13776eb061e4754123e31432f5b68,leggo lt 3 sol investment advice group busines...,636,0
0,31372df13f6fc329b520357ad3cde349,manage risk lose either way degen ape mentalit...,162,0
1,b8a86d5eaebddded3feaf7812f07fe0c,realizing thing defi blue chip industry young ...,1379,1
1,edbf78f8f8abfd87f8cbe08023ae0eaf,checked floor one close minted 2 75 eth 3 160 ...,1277,1
1,be9244da7b1a2b634d443aca3288a633,open secret almost nft marketplaces soon suppo...,1050,1
1,8c43bc7498c69a3678cf7b4853014351,tvl 2 91 billion usd market cap 12 billion con...,827,1
1,fb41227b22d727fbcff4fe780d849de6,apes got bagged bbby amc ape guess lol trader ...,580,1
1,e220b06b623b0b1a406e00e3167e5566,rt pow price go eth needs 600m per month new c...,171,1
1,d4ff198244a9671c78c4105f93085998,hmmm buy ape week amc ape units meme friendly ...,122,1


In [19]:
# Cluster the mega users, then order each cluster from longest to shortest text_length.
df_mega_return = (
    get_df_with_cluster(df_mega_grp, num_lda_topics)
    .groupby('cluster')
    .apply(lambda cluster_rows: cluster_rows.sort_values(by='text_length', ascending=False))
)
df_mega_return

Unnamed: 0_level_0,Unnamed: 1_level_0,texts,text_length,cluster
cluster,twitter user id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,d8816b8d2b001ffe1bf47c5e3b1d8a9e,alarm_clock hurry starting 30 minutes play_but...,1043,0
0,1cbf16d93cc6a672eaed9c16ac07f114,fine got amazing shoot dm rocket envelope_with...,331,0
1,f4bd8c36c48b0e7e12b37733d7d98dd6,rt dear polygon family dizzy something celebra...,1231,1
1,2e9a06568e70652d1ccedff0b433b43e,rt police_car_light 10 000 giveaway money_with...,1026,1
1,a895f3e34691f388ee48b833570e3690,13 step keycap_5 luna airdrop sent vested new ...,906,1
1,623113eb843163bf06c9e75d7ca56961,29 year old built career 2 side hustles earn 1...,876,1
1,a2f7726319df31c397e4ad794d9da217,national cancer survivors day help us celebrat...,704,1
1,5a3336a8e007d3848b30a8cab701f684,rt canada formally labels proud boys terrorist...,632,1
1,d3202f7bed1bc52575b315a8c55b68b6,good job drop dm guys speech_balloon fam fire ...,522,1
1,da3c028d3c264701fef2eded8281ec2f,niggaz eating swag wan na admit rt happy birth...,518,1


In [26]:
# Cluster the 'no influencer' users, then order each cluster from longest to shortest text_length.
df_no_influencer_return = (
    get_df_with_cluster(df_no_influencer_grp, num_lda_topics)
    .groupby('cluster')
    .apply(lambda cluster_rows: cluster_rows.sort_values(by='text_length', ascending=False))
)
df_no_influencer_return

Unnamed: 0_level_0,Unnamed: 1_level_0,texts,text_length,cluster
cluster,twitter user id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,d75542f32c8d3c4cd3533f7c6ad1916f,cloud_with_lightning giveaways cloud_with_ligh...,458,0
0,7b6f2b51b08558e7aae334a806920107,rt police_car_light giveaway alert police_car_...,146,0
0,8ba8b37c646a965c8de4ffcd7598572c,wait egld conference 3rd november count rocket...,118,0
1,be558c5b8dd1a7b9b1ba05fbc324d693,great project revolutionarize cryptocurrency i...,1134,1
1,95b6f482041b5d683321aa3761f28c9d,best project 3rcqxzrzlshwhk5wblamutrga61suufvn...,351,1
1,d5dd10cd702a791660eb98f7a1a6871f,fire heart_on_fire fire fire heart_on_fire sun...,195,1
1,bece0f146543ad67467f7a16bb78e707,nice project smiling_face_with_heart eyes 3ytf...,125,1
1,e1ec91e2c7f3fab9f6d5b10039b58b19,picking 2 eth winners like rt tag 3 frens back...,107,1
1,f5e0db8da605dc466e4bebf0333938a0,introducing scrappyether double_exclamation_ma...,107,1
1,0ed9637249db91cb2c256ec156ce1977,updated price qtum bnb trx qtum 3 361 usd bnb ...,96,1
