# Day 1

In [2]:
import pandas as pd

# Neither CSV file has a header row, so header=None keeps the first record as data
# and leaves the columns labelled with default integers.
raw_csv_paths = (
    '../data/twitter_training.csv',    # training split
    '../data/twitter_validation.csv',  # validation split
)
training_data, validation_data = (
    pd.read_csv(csv_path, header=None) for csv_path in raw_csv_paths
)

In [3]:
def summarize_dataframe(df: pd.DataFrame, df_name: str='DataFrame') -> None:
    """
    Print a quick exploratory summary of a DataFrame.

    The report contains, in order: the shape, the number of missing values
    per column, the column data types, descriptive statistics for every
    column, the number of unique values per column, and the number of
    duplicated rows (followed by a few example rows when duplicates exist).

    Parameters:
        df (pd.DataFrame): The DataFrame to summarize.
        df_name (str): A name for the DataFrame, used in output messages.

    Returns:
        None. The summary is written to standard output.
    """
    print(f"Summary for {df_name}")
    print(f"Shape: {df.shape}")
    print(f"\nMissing Values:\n{df.isnull().sum()}")
    print(f"\nData Types:\n{df.dtypes}")

    # describe() raises ValueError for a frame without columns; report that
    # case explicitly so the remaining sections are still printed.
    if df.shape[1] == 0:
        print("\nDescriptive Statistics:\nNot available (DataFrame has no columns).")
    else:
        print(f"\nDescriptive Statistics:\n{df.describe(include='all')}")
    print(f"\nUnique Values:\n{df.nunique()}")

    # The count uses the default keep='first', i.e. it counts the surplus
    # copies only; the examples use keep=False so every member of a
    # duplicated group (including its first occurrence) is shown.
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        print(f"\nDuplicate Rows: {duplicate_count}")
        print(f"Examples of Duplicate Rows:\n{df[df.duplicated(keep=False)].head()}")
    else:
        print("\nNo Duplicate Rows Found.")

In [4]:
# Profile the raw training set while its columns still carry the default integer labels.
summarize_dataframe(training_data, 'Training Data')

Summary for Training Data
Shape: (24004, 4)

Missing Values:
0      0
1      0
2      0
3    220
dtype: int64

Data Types:
0     int64
1    object
2    object
3    object
dtype: object

Descriptive Statistics:
                   0                          1         2      3
count   24004.000000                      24004     24004  23784
unique           NaN                         11         4  22357
top              NaN  CallOfDutyBlackopsColdWar  Positive       
freq             NaN                       2376      7278     48
mean     6472.156641                        NaN       NaN    NaN
std      4402.326191                        NaN       NaN    NaN
min         1.000000                        NaN       NaN    NaN
25%      2629.000000                        NaN       NaN    NaN
50%      4461.000000                        NaN       NaN    NaN
75%      9777.000000                        NaN       NaN    NaN
max     13200.000000                        NaN       NaN    NaN

Unique Va

In [5]:
# Preview the first rows of the training set to see what each unlabelled column holds.
training_data.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
# Preview the first rows of the validation set to check it has the same four-column layout.
validation_data.head()

Unnamed: 0,0,1,2,3
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [7]:
# Give both splits the same descriptive column names in place of the default integer labels.
descriptive_columns = ['tweet_id', 'entity', 'sentiment', 'tweet_content']
for split in (training_data, validation_data):
    split.columns = list(descriptive_columns)

In [8]:
# Confirm that the descriptive column names were applied to the training set.
training_data.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [9]:
# Confirm that the descriptive column names were applied to the validation set.
validation_data.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [10]:
# Re-run the summary on the training set so the report uses the descriptive column names.
summarize_dataframe(training_data, 'Training Data')

Summary for Training Data
Shape: (24004, 4)

Missing Values:
tweet_id           0
entity             0
sentiment          0
tweet_content    220
dtype: int64

Data Types:
tweet_id          int64
entity           object
sentiment        object
tweet_content    object
dtype: object

Descriptive Statistics:
            tweet_id                     entity sentiment tweet_content
count   24004.000000                      24004     24004         23784
unique           NaN                         11         4         22357
top              NaN  CallOfDutyBlackopsColdWar  Positive              
freq             NaN                       2376      7278            48
mean     6472.156641                        NaN       NaN           NaN
std      4402.326191                        NaN       NaN           NaN
min         1.000000                        NaN       NaN           NaN
25%      2629.000000                        NaN       NaN           NaN
50%      4461.000000                        Na

In [11]:
# Produce the same summary for the validation set to compare it with the training set.
summarize_dataframe(validation_data, 'Validation Data')

Summary for Validation Data
Shape: (1000, 4)

Missing Values:
tweet_id         0
entity           0
sentiment        0
tweet_content    0
dtype: int64

Data Types:
tweet_id          int64
entity           object
sentiment        object
tweet_content    object
dtype: object

Descriptive Statistics:
            tweet_id                  entity sentiment tweet_content
count    1000.000000                    1000      1000          1000
unique           NaN                      32         4           999
top              NaN  RedDeadRedemption(RDR)   Neutral           Wow
freq             NaN                      40       285             2
mean     6432.088000                     NaN       NaN           NaN
std      3728.310569                     NaN       NaN           NaN
min         6.000000                     NaN       NaN           NaN
25%      3247.750000                     NaN       NaN           NaN
50%      6550.000000                     NaN       NaN           NaN
75%      96

---

# Day 2

In [13]:
# Remove rows with any missing value, then exact duplicate rows, from the training set.
# Written as one chained expression that rebinds the name instead of mutating in place.
training_data = training_data.dropna().drop_duplicates()

In [14]:
# Verify the clean-up: the summary should now report no missing values and no duplicate rows.
summarize_dataframe(training_data, 'Training Data')

Summary for Training Data
Shape: (22949, 4)

Missing Values:
tweet_id         0
entity           0
sentiment        0
tweet_content    0
dtype: int64

Data Types:
tweet_id          int64
entity           object
sentiment        object
tweet_content    object
dtype: object

Descriptive Statistics:
            tweet_id        entity sentiment  \
count   22949.000000         22949     22949   
unique           NaN            11         4   
top              NaN  WorldOfCraft  Positive   
freq             NaN          2300      6888   
mean     6473.488431           NaN       NaN   
std      4405.392822           NaN       NaN   
min         1.000000           NaN       NaN   
25%      2626.000000           NaN       NaN   
50%      4466.000000           NaN       NaN   
75%      9777.000000           NaN       NaN   
max     13200.000000           NaN       NaN   

                                            tweet_content  
count                                               22949  
uniqu

In [40]:
# Show one randomly drawn validation tweet in full (untruncated) to see what needs cleaning.
# No random seed is passed, so a different tweet is printed on each run.
random_tweet = validation_data['tweet_content'].sample()
print(random_tweet.iloc[0])

Good Job @Blizzard_Ent 😡 @BlizzLatAm @Warcraft pic.twitter.com/zyoT3FLEn0


In [43]:
# Goal: strip URLs, hashtags, mentions, and special characters/numbers from the
# tweet content, then remove stopwords (e.g. 'a', 'an', 'the').
# NOTE(review): PorterStemmer and word_tokenize are not used by any cell shown
# here; confirm they are still needed.
import re # Regular Expression
import nltk # Natural Language Toolkit
nltk.download('stopwords')  # Fetch the stopword corpus used via nltk.corpus.stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
from functools import lru_cache

# Matches links that start with "http", "www." links, and scheme-less Twitter
# media links such as "pic.twitter.com/zyoT3FLEn0". Without the last two
# alternatives such links survive URL removal and are later squashed into a
# single meaningless token when punctuation is stripped.
_URL_PATTERN = re.compile(
    r'http\S+|\bwww\.\S+|\bpic\.twitter\.com/\S+',
    re.IGNORECASE,
)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """
    Normalize a raw tweet into a lowercase, space-separated string of words.

    Steps, in order: remove URLs, remove hashtags, remove @mentions, drop
    every character that is not an ASCII letter or whitespace, lowercase the
    text, and discard English stop words.

    Parameters:
        text: The raw tweet text. Any non-string value (for example None or
            a NaN left by a missing cell) is treated as an empty tweet.

    Returns:
        str: The cleaned text; an empty string when nothing remains.
    """
    # Missing values would otherwise raise TypeError inside re.sub.
    if not isinstance(text, str):
        return ''

    text = _URL_PATTERN.sub('', text)  # Remove URLs
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # Remove hashtags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase

    # NOTE(review): apostrophes are stripped before this filter, so a
    # contraction such as "don't" is compared as "dont" - confirm whether
    # such forms should be filtered as well.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [45]:
# Store a cleaned version of every tweet next to the original text, for both splits.
for split in (training_data, validation_data):
    split['cleaned_tweet_content'] = split['tweet_content'].apply(clean_text)

In [48]:
# Persist both cleaned splits. to_csv is called with its defaults, so the row
# index is written as the first column of each file.
cleaned_outputs = (
    (training_data, '../data/cleaned_twitter_training.csv'),
    (validation_data, '../data/cleaned_twitter_validation.csv'),
)
for frame, csv_path in cleaned_outputs:
    frame.to_csv(csv_path)

In [None]:
|