In [27]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [28]:
# Input locations, relative to the notebook's working directory
TWEETS_PATH = "../data/raw_tweets_text.csv"
SENTIMENT_PATH = "../data/t4sa_text_sentiment.tsv"

# Raw tweet text: comma-separated, first row is the header, decoded as latin-1.
# NOTE(review): confirm latin-1 is the intended encoding for this file; if it
# is actually UTF-8, non-ASCII characters will be decoded incorrectly.
tweets_df = pd.read_csv(
    TWEETS_PATH,
    header=0,
    encoding='latin-1',
)

# Sentiment scores: tab-separated, first row is the header.
sentiment_df = pd.read_csv(
    SENTIMENT_PATH,
    header=0,
    sep='\t',
)

# Basic Information

In [29]:
# (rows, columns) of each raw table, one per line
print(tweets_df.shape, sentiment_df.shape, sep='\n')

(3452663, 2)
(1179957, 4)


In [30]:
# Preview the first five rows of each table as plain text, tweets first
print(tweets_df.head(), sentiment_df.head(), sep='\n')

                   id                                               text
0  758014713804587008  RT @polarcomic: And surprise! the #RegularShow...
1  758014717990428672  RT @SweetBabyBellB: My unproblematic fav who k...
2  758014646716665857  RT @WhyLarryIsReal: I mean we know harry isn't...
3  758014655071526912  RT @Eastbay: She's ready, resilient, and on ou...
4  758014642526429184  RT @SheeeRatchet: find someone who loves you a...
                 TWID       NEG       NEU       POS
0  768096868504969216  0.049398  0.861395  0.089207
1  768097237620490241  0.028733  0.929554  0.041713
2  768097619281227776  0.006598  0.046810  0.946591
3  768097619285536768  0.032333  0.850945  0.116722
4  768097627686604801  0.008090  0.042331  0.949579


In [31]:
# Summary statistics of the numeric columns, tweets first and then sentiment
print(*(table.describe() for table in (tweets_df, sentiment_df)), sep='\n')

                 id
count  3.452663e+06
mean   7.865426e+17
std    1.381574e+16
min    7.580146e+17
25%    7.695707e+17
50%    7.862700e+17
75%    7.993751e+17
max    8.046194e+17
               TWID           NEG           NEU           POS
count  1.179957e+06  1.179957e+06  1.179957e+06  1.179957e+06
mean   7.860716e+17  1.214643e-01  5.272504e-01  3.512854e-01
std    1.386547e+16  2.489799e-01  3.953345e-01  3.879292e-01
min    7.680969e+17  2.930239e-14  2.250815e-03  2.441870e-14
25%    7.692905e+17  1.118029e-02  8.237851e-02  7.404817e-02
50%    7.839379e+17  1.924086e-02  8.548171e-01  1.057651e-01
75%    7.996407e+17  3.546559e-02  8.904971e-01  8.860867e-01
max    8.046194e+17  9.939882e-01  1.000000e+00  9.965788e-01


# Cleaning the Data

In [32]:


# Duplicate check on the key columns. The two tables are joined on
# id / TWID, so what matters is whether a key repeats, not whether an entire
# row repeats: two rows sharing a TWID but carrying different scores would not
# be counted by a whole-row duplicated() check.
print("Duplicate amounts in tweets_df:")
print(tweets_df['id'].duplicated().sum())

print("Duplicate amounts in sentiment_df:")
print(sentiment_df['TWID'].duplicated().sum())

# Per-column count of missing values in tweets_df
print("Missing values in tweets_df:")
print(tweets_df.isna().sum())

# Per-column count of missing values in sentiment_df
print("\nMissing values in sentiment_df:")
print(sentiment_df.isna().sum())



Duplicate amounts in tweets_df:
0
Duplicate amounts in sentiment_df:
0
Missing values in tweets_df:
id      0
text    0
dtype: int64

Missing values in sentiment_df:
TWID    0
NEG     0
NEU     0
POS     0
dtype: int64


In [33]:
# Inner-join tweet text with its sentiment scores on the tweet ID.
# validate='one_to_one' makes pandas raise MergeError if either key column
# contains repeated values, instead of silently multiplying rows in the result.
merged_df = pd.merge(
    tweets_df,
    sentiment_df,
    how='inner',
    left_on='id',
    right_on='TWID',
    validate='one_to_one',
)
# After an inner join on id == TWID the two key columns are identical; keep 'id' only.
merged_df = merged_df.drop(columns=['TWID'])

# Work on a separate copy so merged_df stays as the untouched join result.
main_df = merged_df.copy()

In [86]:
# Useful functions to help extract data from the columns

def extract_username_from_text(text):
    """Return the account name from a leading ``RT @name:`` prefix.

    The prefix must sit at the very start of ``text``; the name is the run of
    characters after ``@`` up to the next colon and may not contain whitespace.

    Args:
        text: Tweet text as a string.

    Returns:
        The captured name as a string, or None when ``text`` does not start
        with such a prefix.
    """
    prefix = re.search(r'^RT @([^\s:]+):', text)
    return prefix.group(1) if prefix else None
    
def extract_links_from_text(text):
    """Collect the http(s) URLs that appear in ``text``.

    A URL is taken to be ``http://`` or ``https://`` followed by every
    non-whitespace character up to the next whitespace or the end of the text.

    Args:
        text: Tweet text as a string.

    Returns:
        None when no URL is present, the URL itself (a string) when there is
        exactly one, and a list of URLs in order of appearance otherwise.
    """
    found = re.findall(r'https?://\S+', text)

    # No match at all: signal absence with None rather than an empty list
    if len(found) == 0:
        return None

    # A lone URL is unwrapped to a plain string; several stay as a list
    return found[0] if len(found) == 1 else found
        


In [87]:
# Derive extra columns that might help with the visualizations.
main_df = main_df.assign(
    # True when the tweet text begins with the literal prefix 'RT '
    is_retweet=main_df['text'].str.startswith('RT '),
    # Name captured from a leading 'RT @name:' prefix, None otherwise
    username=main_df['text'].apply(extract_username_from_text),
    # None, a single URL string, or a list of URLs found in the text
    url=main_df['text'].apply(extract_links_from_text),
)
# TODO: add cleaned_text
# TODO: add hashtags
# TODO: add mentions
main_df.tail(50)


Unnamed: 0,id,text,NEG,NEU,POS,is_retweet,username,url
1179907,804609958360838144,I am now live on webcam find me here &gt;&gt; ...,0.017945,0.930804,0.05125,False,,"[https://t.co/yg0pJss4MK, https://t.co/QMXtTx4..."
1179908,804610000320626688,Allan Bloom~ Education is the movement from da...,0.020289,0.904304,0.075407,False,,https://t.co/era0R3l1Bp
1179909,804610537191575552,It's lit on the blog #2016MAMA https://t.co/TY...,0.037687,0.889395,0.072918,False,,https://t.co/TYsHO2iqZ3
1179910,804610834957791236,Im a Boxer. Discover which #dog breed you are!...,0.018554,0.868268,0.113179,False,,"[https://t.co/GiiIhtvgWm, https://t.co/h9wJClL..."
1179911,804611069859758080,National Mutt Day - Happy #NationalMuttDay! ht...,0.024677,0.097129,0.878194,False,,https://t.co/zq6pzVkPfO
1179912,804611401201356800,Click here for more Pictures: https://t.co/dCl...,0.033407,0.866509,0.100084,False,,"[https://t.co/dClI6OLUieAdd, https://t.co/asFv..."
1179913,804611447326064640,Happy Holidaze from Mother Nature &amp; the Ea...,0.014756,0.051565,0.933678,False,,https://t.co/qrsRc3NbvN
1179914,804611602515230720,#Turkey #Yemen 21 years of imprisonment for re...,0.058305,0.878526,0.063169,False,,"[https://t.co/FmotkuxiOs, https://t.co/kjtWdst..."
1179915,804612617549606912,USB-C Flash Drive for new Apple MacBook -&gt; ...,0.046054,0.854462,0.099485,False,,"[https://t.co/fQOFQuahRI, https://t.co/UXLOetT..."
1179916,804612634347732992,If you have been wondering how to find a worki...,0.038333,0.852796,0.108871,False,,"[https://t.co/oZrdNzbK1B, https://t.co/fHEG0Mi..."
