In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

citations:
https://www.geeksforgeeks.org/data-analysis/exploratory-data-analysis-in-python/


In [3]:
TWEETS_PATH= "../data/raw_tweets_text.csv"
SENTIMENT_PATH="../data/t4sa_text_sentiment.tsv"


load + small transformations for more readable dataset

In [4]:
#load data
tweets_df = pd.read_csv(TWEETS_PATH, encoding='latin-1', header=0)
sentiment_df= pd.read_csv(SENTIMENT_PATH, sep='\t', header=0)


## Get Basic information

In [5]:
print(tweets_df.shape)
print(sentiment_df.shape)

(3452663, 2)
(1179957, 4)


In [6]:
print(tweets_df.info())
print(sentiment_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3452663 entries, 0 to 3452662
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   id      int64 
 1   text    object
dtypes: int64(1), object(1)
memory usage: 52.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1179957 entries, 0 to 1179956
Data columns (total 4 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   TWID    1179957 non-null  int64  
 1   NEG     1179957 non-null  float64
 2   NEU     1179957 non-null  float64
 3   POS     1179957 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 36.0 MB
None


In [7]:

# split second column of tweets into first word, main tweet, link 
# Process in chunks to avoid running out of memory
chunk_size = 10000
num_rows = tweets_df.shape[0]

main_text = []
last_link = []

for start in range(0, num_rows, chunk_size):
    end = min(start + chunk_size, num_rows)
    text_chunk = tweets_df['text'].iloc[start:end].str.split()
    last_link.extend(text_chunk.str[-1].where(text_chunk.str[-1].str.startswith('http'), ''))
    main_text.extend(text_chunk.apply(lambda x: ' '.join(x[:-1]) if x and x[-1].startswith('http') else ' '.join(x) if x else ''))

tweets_df['main_text'] = main_text
tweets_df['last_link'] = last_link



In [8]:
tweets_df.nunique()

id           3452663
text         3176206
main_text    3024375
last_link    3006894
dtype: int64

In [9]:
matching_ids = set(tweets_df['id']).intersection(set(sentiment_df['TWID']))

exclusive_tweets_df = tweets_df[~tweets_df['id'].isin(matching_ids)]
exclusive_sentiment_df = sentiment_df[~sentiment_df['TWID'].isin(matching_ids)]

merged_df = pd.merge(
    exclusive_tweets_df,
    exclusive_sentiment_df,
    left_on='id',
    right_on='TWID',
    how='outer'
)



# EDA section

do your eda here
you probably want to create a copy of the dataset before making any transformations(so you dont have to reload the entire notebook if you make a mistake)

In [10]:
tweets_df_copy = tweets_df.copy()
sentiment_df_copy = sentiment_df.copy()

In [11]:
matching_ids = set(tweets_df['id']).intersection(set(sentiment_df['TWID']))

like_tweets_df = tweets_df[tweets_df['id'].isin(matching_ids)]
like_sentiment_df = sentiment_df[sentiment_df['TWID'].isin(matching_ids)]

merged_df = pd.merge(
    like_tweets_df,
    like_sentiment_df,
    left_on='id',
    right_on='TWID',
    how='inner'
)

In [12]:
merged_df.sort_values(by="id", ascending=True).head()

Unnamed: 0,id,text,main_text,last_link,TWID,NEG,NEU,POS
4,768096868504969216,#Incredible #India #Atulya #Bharat - Land of S...,#Incredible #India #Atulya #Bharat - Land of S...,https://t.co/vpghReZWsa,768096868504969216,0.049398,0.861395,0.089207
965322,768097237620490241,RT @AlwaysTrustKay: Are you near a Western uni...,RT @AlwaysTrustKay: Are you near a Western uni...,httpsâ¦,768097237620490241,0.028733,0.929554,0.041713
8,768097619281227776,RT @KendallHuntRPD: The #firstdayofschool for ...,RT @KendallHuntRPD: The #firstdayofschool for ...,,768097619281227776,0.006598,0.04681,0.946591
10,768097619285536768,RT @abbiesf_: Kate wrights figure is all I wan...,RT @abbiesf_: Kate wrights figure is all I wan...,https://t.co/0AtCwSKo2w,768097619285536768,0.032333,0.850945,0.116722
0,768097627686604801,Josh Jenkins is looking forward to TAB Breeder...,Josh Jenkins is looking forward to TAB Breeder...,https://t.co/ejnA78Sks0,768097627686604801,0.00809,0.042331,0.949579


In [13]:
data = merged_df.copy()
data['text_len'] = merged_df.text.fillna('').astype(str).str.len()
data.head()


Unnamed: 0,id,text,main_text,last_link,TWID,NEG,NEU,POS,text_len
0,768097627686604801,Josh Jenkins is looking forward to TAB Breeder...,Josh Jenkins is looking forward to TAB Breeder...,https://t.co/ejnA78Sks0,768097627686604801,0.00809,0.042331,0.949579,114
1,768097631864102912,RT @2pmthailfans: [Pic] Nichkhun from krjeong8...,RT @2pmthailfans: [Pic] Nichkhun from krjeong8...,https://t.co/5gcAcu9by7,768097631864102912,0.014644,0.926557,0.0588,76
2,768097640278089729,RT @MianUsmanJaved: Congratulations Pakistan o...,RT @MianUsmanJaved: Congratulations Pakistan o...,https://t.co/1oâ¦,768097640278089729,0.004939,0.029469,0.965591,142
3,768097627695042560,"RT @PEPalerts: This September, @YESmag is taki...","RT @PEPalerts: This September, @YESmag is taki...",https://t.co/oXâ¦,768097627695042560,0.006389,0.018663,0.974948,144
4,768096868504969216,#Incredible #India #Atulya #Bharat - Land of S...,#Incredible #India #Atulya #Bharat - Land of S...,https://t.co/vpghReZWsa,768096868504969216,0.049398,0.861395,0.089207,111


In [14]:
l = data[['text_len', 'NEG',  'NEU', 'POS']]

In [None]:
merged_df.duplicated().value_counts()

False    1179957
Name: count, dtype: int64

In [None]:
merged_df['text'].duplicated().value_counts()

text
False    1058954
True      121003
Name: count, dtype: int64

In [None]:
texts_duplicated = merged_df['text'].duplicated()
texts_duplicated

0          False
1          False
2          False
3          False
4          False
           ...  
1179952    False
1179953    False
1179954    False
1179955    False
1179956    False
Name: text, Length: 1179957, dtype: bool

In [None]:
texts_duplicated

0          False
1          False
2          False
3          False
4          False
           ...  
1179952    False
1179953    False
1179954    False
1179955    False
1179956    False
Name: text, Length: 1179957, dtype: bool