# Load data and preprocess #

### This code is for loading the data from kaggle, performing any necessary transformations, and storing the modified results in a .csv file for easier access ###

In [1]:
# Automatically load changes in dependency files (may be unnecessary here, but useful tool in case you're modifying packages that this file relies on)
%load_ext autoreload
%autoreload 2

## First need to download the dataset from Kaggle ##
I moved the dataset to a local directory for better access to it, since the default installed in .cache/kagglehub

In [6]:

# %conda install kagglehub
import kagglehub

# Download latest version
path = kagglehub.dataset_download("goyaladi/twitter-bot-detection-dataset")

print("Path to dataset files:", path)

^C

Note: you may need to restart the kernel to use updated packages.
Downloading from https://www.kaggle.com/api/v1/datasets/download/goyaladi/twitter-bot-detection-dataset?dataset_version_number=2...


100%|██████████| 2.94M/2.94M [00:00<00:00, 9.42MB/s]

Extracting model files...
Path to dataset files: C:\Users\maden\.cache\kagglehub\datasets\goyaladi\twitter-bot-detection-dataset\versions\2





Channels:
 - defaults
 - conda-forge
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import sklearn

import os

In [57]:
# Recursive find for bot data from current directory
path = None
for dirpath, dirnames, filenames in os.walk("."):
    for filename in filenames:
        if filename == "bot_detection_data.csv":
            path = os.path.join(dirpath, filename)

assert path is not None, "bot_detection_data.csv not found in current directory or any subdirectories"
print(f"Path to csv file: {path}")

Path to csv file: .\COSC325\COSC325_Final_Project\COSC325_Final\data\twitter-bot-detection-dataset\versions\2\bot_detection_data.csv


In [83]:
# Read in data into dataframe and get some preliminary information about dataset
df = pd.read_csv(path)

print(df.dtypes)

df.head()

User ID            int64
Username          object
Tweet             object
Retweet Count      int64
Mention Count      int64
Follower Count     int64
Verified            bool
Bot Label          int64
Location          object
Created At        object
Hashtags          object
dtype: object


Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention


In [84]:
# An issue right off the bat, created at should be Unix epoch time. Convert to datetime, then to int 
df["Created At"] = pd.to_datetime(df["Created At"]).astype("int64") // 10**9 # Convert to seconds
df["Location"] = df["Location"].astype(str)
df["Hashtags"] = df["Hashtags"].astype(str)

print(df.dtypes)

User ID            int64
Username          object
Tweet             object
Retweet Count      int64
Mention Count      int64
Follower Count     int64
Verified            bool
Bot Label          int64
Location          object
Created At         int64
Hashtags          object
dtype: object


In [85]:
# Check for NaN values in the dataset
print(df.isna().sum())

User ID           0
Username          0
Tweet             0
Retweet Count     0
Mention Count     0
Follower Count    0
Verified          0
Bot Label         0
Location          0
Created At        0
Hashtags          0
dtype: int64


In [86]:
# Source: https://tedboy.github.io/nlps/generated/generated/nltk.tokenize.TweetTokenizer.html
from nltk.tokenize import TweetTokenizer, word_tokenize # For tweets and Word-level tokenizer for locations and hashtags, respectively

import nltk
nltk.download('punkt_tab') # Have to download several extra packages to get tokenizer working

'''
Looks like data is already fairly clean, the only NaN column is hashtags
We want our model to generalize, so it should not include the User ID or Username as parameters
The bot label will need to be dropped before feeding data to the model.
Our biggest problem now is the string labels - they can't be one-hot-encoded, so we will need to tokenize them.
'''

df = df.drop(["User ID", "Username"], axis=1)
df.head()

twt_tokenizer = TweetTokenizer()

def tokenize_column(column_name: str):
    assert df[column_name].dtype == 'object', f"Column: {column_name} is not a string."
    df[column_name] = df[column_name].apply(word_tokenize)

tokenize_column("Location")
tokenize_column("Hashtags")

df["Tweet"] = df["Tweet"].apply(twt_tokenizer.tokenize)

df.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\maden\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,"[Station, activity, person, against, natural, ...",85,1,2353,False,1,[Adkinston],1589210990,[nan]
1,"[Authority, research, natural, life, material,...",55,5,9617,True,0,[Sanderston],1669439890,"[both, live]"
2,"[Manage, whose, quickly, especially, foot, non...",6,2,4363,True,0,[Harrisonfurt],1659928614,"[phone, ahead]"
3,"[Just, cover, eight, opportunity, strong, poli...",54,5,2242,True,1,[Martinezberg],1628980025,"[ever, quickly, new, I]"
4,"[Animal, sign, six, data, good, or, .]",26,3,8438,False,1,[Camachoville],1586813061,"[foreign, mention]"


In [None]:
# Now, convert these tokenized strings to numeric values to be passed into SVM
# Looking at either sklearn TfidfVectorizor or CountVectorizer
# Calling fit() and transform() separately may give the results we want

In [87]:
# Next, we check for collinearity among these features
# Source: https://medium.com/5-minute-eda/5-minute-eda-correlation-heatmap-b57bbb7bae14#:~:text=A%20correlation%20heatmap%20is%20a,1%20signifies%20a%20perfect%20correlation.
sns.set_theme(style="white")
corr = df.corr()
mask = np.zeros_like(corr, dtype=bool) # Array of 0s with same size and dtype as corr
mask[np.triu_indices_from(mask)] = True # Upper triangle set to True
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0, square=True, linewidth=.5, sbar_kws={'shrink': .5})
ax.set_title("Collinearity of Features")

ValueError: setting an array element with a sequence.