In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
df = pd.read_csv('twitter_human_bots_dataset.csv')
display(f"Shape: {df.shape}")

'Shape: (37438, 20)'

In [10]:
# Drop duplicate ID's
df = df.drop_duplicates(subset=['id'], keep='first')

# Turn numerical nulls -> 0
numericals = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day', 'account_age_days']
nulls = df[numericals].isnull().sum()
df[numericals] = df[numericals].fillna(0)

# Turn string nulls -> ""
text_cols = ['description', 'location', 'screen_name']
nulls = df[text_cols].isnull().sum()
print(nulls)
df['description'] = df['description'].fillna("")
df['location'] = df['location'].fillna('unknown')

# Turn bools into int
bool_cols = ['geo_enabled', 'verified', 'default_profile', 'default_profile_image']
for col in bool_cols:
    df[col] = df[col].astype(int)


description    0
location       0
screen_name    0
dtype: int64


In [11]:
# drop url cols
# can turn into binary has url or not
drop_cols = ['profile_background_image_url', 'profile_image_url', 'created_at']
df = df.drop(columns=drop_cols)

KeyError: "['profile_background_image_url', 'profile_image_url'] not found in axis"

In [8]:
from sklearn.decomposition import TruncatedSVD


tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_mat = tfidf.fit_transform(df['description'])

# compress 1000 columns down to just 10 "Topic" columns
num_topic = 10
svd = TruncatedSVD(n_components=num_topic, random_state=42)
svd_matrix = svd.fit_transform(tfidf_mat)

svd_df = pd.DataFrame(
    svd_matrix, 
    columns=[f"topic_{i}" for i in range(num_topic)]
)

df_final = pd.concat([df.reset_index(drop=True), svd_df.reset_index(drop=True)], axis=1)

print(f"Added {num_topic} text component features.")
print(df_final.shape)
display(df_final.head())

Added 10 text component features.
(37438, 28)


  U = Q @ Uhat
  U = Q @ Uhat
  U = Q @ Uhat


Unnamed: 0.1,Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,...,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,0,2016-10-15 21:32:11,False,False,"Blame @xaiax, Inspired by @MakingInvisible, us...",4,1589,4,False,787405734442958848,...,0.001781,0.005883,-0.003099,5.2e-05,0.000155,-0.004142,0.006596,0.002607,-0.002992,0.000581
1,1,2016-11-09 05:01:30,False,False,Photographing the American West since 1980. I ...,536,860,880,False,796216118331310080,...,0.002054,0.015923,-0.005647,0.001754,0.000618,-0.007009,0.012344,0.005536,-0.012382,0.008377
2,2,2017-06-17 05:34:27,False,False,Scruffy looking nerf herder and @twitch broadc...,3307,172,594,True,875949740503859204,...,0.026887,0.17393,-0.204333,0.156629,-0.328408,-0.186608,-0.144921,-0.07227,0.03301,-0.02544
3,3,2016-07-21 13:32:25,True,False,Wife.Godmother.Friend.Feline Fanatic! Assistan...,8433,517,633,True,756119643622735875,...,0.004175,0.040474,-0.00481,-0.009537,-0.002713,-0.009579,0.058916,0.025927,-0.026591,0.025601
4,4,2012-01-15 16:32:35,False,False,Loan coach at @mancity & Aspiring DJ,88,753678,116,True,464781334,...,0.004174,0.020453,-0.010019,-0.000388,-0.004565,-0.007142,0.024102,0.012082,-0.007458,0.013882


In [None]:
# not sure what to do for location

In [6]:
# if people want to go tf_idf route, but I wasn't sure about this one 


# tfidf = TfidfVectorizer(max_features=1000, stop_words='english', dtype='float32')
# tfidf_mat = tfidf.fit_transform(df['description'])

# # name each column to that word
# tfidf_df = pd.DataFrame(
#     tfidf_mat.toarray(), 
#     columns=[f"{word}" for word in tfidf.get_feature_names_out()]
# )

# df = df.reset_index(drop=True)
# tfidf_df = tfidf_df.reset_index(drop=True)
# df_final = pd.concat([df, tfidf_df], axis=1)


# print(df_final.shape, df_final.head())


In [7]:
display(f"Shape: {df.shape}")

# Vectorize string fields

'Shape: (37438, 18)'