# Data Preprocessing

This file is the data preprocessing part of the project, submitted to Prof. Soong Moon Kang for MSIN0074 Network Analysis by SRN 22086573.

### Importing Libraries and Loading Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import contractions
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from langdetect import detect
from collections import Counter
import langdetect


nltk.download('vader_lexicon')
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ijeonghyeon/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ijeonghyeon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ijeonghyeon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ijeonghyeon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv("twitter_concat.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79105 entries, 0 to 79104
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   79099 non-null  object 
 1   text                 79085 non-null  object 
 2   user_id              79065 non-null  object 
 3   timestamp            79065 non-null  object 
 4   retweet_count        79065 non-null  float64
 5   favorite_count       79055 non-null  float64
 6   in_reply_to_user_id  14908 non-null  object 
 7   twt_hashtags         79065 non-null  object 
 8   user_name            79063 non-null  object 
 9   followers_count      79065 non-null  float64
 10  friends_count        79045 non-null  float64
dtypes: float64(4), object(7)
memory usage: 6.6+ MB


## Data Cleaning

In [4]:
print(df.duplicated().sum())
print(df.isnull().sum())

1080
id                         6
text                      20
user_id                   40
timestamp                 40
retweet_count             40
favorite_count            50
in_reply_to_user_id    64197
twt_hashtags              40
user_name                 42
followers_count           40
friends_count             60
dtype: int64


In [5]:
df = df.drop_duplicates()
df = df.dropna(subset=['text'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78013 entries, 0 to 79104
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   78013 non-null  object 
 1   text                 78013 non-null  object 
 2   user_id              78005 non-null  object 
 3   timestamp            78005 non-null  object 
 4   retweet_count        78005 non-null  float64
 5   favorite_count       77996 non-null  float64
 6   in_reply_to_user_id  14789 non-null  object 
 7   twt_hashtags         78005 non-null  object 
 8   user_name            78003 non-null  object 
 9   followers_count      78005 non-null  float64
 10  friends_count        77986 non-null  float64
dtypes: float64(4), object(7)
memory usage: 7.1+ MB


### Text Preprocessing - Tweets

In [7]:
# Define a list of negation cues
negation_cues = ["not", "n't", "never", "no", "none", "neither", "nor"]

def text_preprocessing(text):
    # Convert to lowercase
    text = str(text)
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize the text
    text_tokens = nltk.word_tokenize(text)
    # Handle negation cues
    negated = False
    for i, token in enumerate(text_tokens):
        if token.lower() in negation_cues:
            negated = True
        elif negated:
            text_tokens[i] = "NOT_" + token
            negated = False
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_text = [word for word in text_tokens if not word in stop_words]
    # Join the filtered words back into a string
    text = ' '.join(filtered_text)
    # Replace contractions with their expanded form
    text = contractions.fix(text)
    return text

tweets = df.text
processed_tweets = []
for text in tweets:
    result = text_preprocessing(text)
    processed_tweets.append(result)
    
processed_tweets = pd.Series(processed_tweets)

lemmatizer = WordNetLemmatizer()

# Define a function that takes a sentence as input and returns a list of lemmas
def lemmatize_nltk(sentence):
    tokens = nltk.word_tokenize(sentence)
    # Perform part-of-speech tagging on the tokens 
    pos_tags = nltk.pos_tag(tokens)
    lemmas = []
    for token, tag in pos_tags:
        # Map the POS tag to the corresponding WordNet POS tag
        tag = get_wordnet_pos(tag)
        if tag:
            lemma = lemmatizer.lemmatize(token, tag)
        else:
            lemma = lemmatizer.lemmatize(token)
        lemmas.append(lemma)
    return lemmas

# Define a function that maps NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('N'):
        return 'n'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('J'):
        return 'a'
    elif tag.startswith('R'):
        return 'r'
    else:
        return None

lemmatized_words = []
for sentence in processed_tweets:
    lemmas = lemmatize_nltk(sentence)
    lemmatized_words.append(lemmas)
    
# Convert the list of lemmatized words to a Series
lemmatised_tweets = pd.Series(lemmatized_words)

In [8]:
# Test with the text_preprocessing model
text = "I don't like the long and boring movie."
text = text_preprocessing(text)
text = lemmatize_nltk(text)
text

['do', 'not', 'like', 'long', 'boring', 'movie']

In [11]:
# Words to remove
remove_words_tw = ["rt","be","de","get","do","use","one","la","en","’","u"] 

# Remove words from each list using list comprehension
tweets = [[word for word in lst if word not in remove_words_tw] for lst in lemmatised_tweets]

# Remove the numbers from each list and clean hashtags for final
preprocessed_tweets = [[word for word in tweet if not str(word).isnumeric()] for tweet in tweets]

In [12]:
pre_tweet_corpus = []

for doc in preprocessed_tweets:
    sentence = " ".join(doc)
    pre_tweet_corpus.append(sentence)

In [13]:
# Detecting the language and extract only english sentences
tweet_corpus = []

for tweet in pre_tweet_corpus:
    try:
        lang = detect(tweet)
        if lang == "en":
            tweet_corpus.append(tweet)
    except:
        pass

In [14]:
tweet_corpus[:5]

['🔥 denet giveaway ！🔥 🏆 reward pools：823925 worth fb token ✅ follow ✅ like amp ✅ complete denet task ⤵️ httpstco3m84n8brg6 🔔tip invite likely cult pinetwork rio airdrop usdc busd bitcoin giveaway denet',
 'bitcoin pump original narrative absolutely love see',
 'adam3us great article cover multiple reason bitcoin adoption network effect usual technology',
 'airdropinspect new airdrop kollect usdt total reward usdt rate ⭐️⭐️⭐️⭐️ winner random amp top distribution within we…',
 'getyafacemelted btc amp crypto look strong af would NOT_surprise test 30k minimum next day bank distress …']

In [15]:
len(tweet_corpus)

59151

### Text preprocessing - Hashtags

In [16]:
hashtags = df.twt_hashtags

processed_hashtags = []
for text in hashtags:
    result = text_preprocessing(text)
    processed_hashtags.append(result)
    
processed_hashtags = pd.Series(processed_hashtags)

lemmatized_hashtags = []
for sentence in processed_hashtags:
    lemmas = lemmatize_nltk(sentence)
    lemmatized_hashtags.append(lemmas)
    
lemmatised_hashtags = pd.Series(lemmatized_hashtags)

In [17]:
# Words to remove
remove_words = ['text', 'indices','index']

# Remove words from each list using list comprehension
hashtags = [[word for word in lst if word not in remove_words] for lst in lemmatised_hashtags]

# Remove the numbers from each list and clean hashtags for final
preprocessed_hashtags = [[word for word in hashtag if not str(word).isnumeric()] for hashtag in hashtags]

In [18]:
pre_hashtag_corpus = []

for doc in preprocessed_hashtags:
    sentence = " ".join(doc)
    pre_hashtag_corpus.append(sentence)

In [19]:
# Detecting the language and extract only english sentences

hashtag_corpus = []

for sentence in pre_hashtag_corpus:
    try:
        lang = langdetect.detect(sentence)
        if lang == "en":
            hashtag_corpus.append(sentence)
    except:
        pass

In [20]:
hashtag_corpus[:5]

['pinetwork airdrop usdc busd bitcoin giveaway denet',
 'btc',
 'bitcoin',
 'bitcoin giveaway',
 'bitcoin']

In [22]:
%store tweet_corpus hashtag_corpus df

Stored 'tweet_corpus' (list)
Stored 'hashtag_corpus' (list)
Stored 'df' (DataFrame)
