In [73]:
import pandas as pd

# Both CSV exports ship without a header row; header=None keeps the first
# record as data and leaves the columns with integer labels.
training_data = pd.read_csv(
    '../data/twitter_training.csv',
    header=None,
)

validation_data = pd.read_csv(
    '../data/twitter_validation.csv',
    header=None,
)

In [74]:
def summarize_dataframe(df: pd.DataFrame, df_name: str='DataFrame') -> None:
    """
    Print a plain-text summary of a DataFrame.

    The report covers, in order: shape, per-column missing-value counts,
    column dtypes, descriptive statistics (``describe(include='all')``),
    per-column unique-value counts, and duplicate rows.

    Parameters:
        df (pd.DataFrame): The DataFrame to summarize.
        df_name (str): A name for the DataFrame, used in output messages.

    Returns:
        None. All output is written to stdout via ``print``.
    """
    print(f"Summary for {df_name}")
    print(f"Shape: {df.shape}")
    print(f"\nMissing Values:\n{df.isnull().sum()}")
    print(f"\nData Types:\n{df.dtypes}")

    # describe() raises ValueError on a frame with no columns, so report that
    # case explicitly instead of aborting the rest of the summary.
    if df.shape[1] == 0:
        descriptive_stats = "N/A (DataFrame has no columns)"
    else:
        descriptive_stats = df.describe(include='all')
    print(f"\nDescriptive Statistics:\n{descriptive_stats}")
    print(f"\nUnique Values:\n{df.nunique()}")

    # duplicated() flags every repeat after the first occurrence, so the sum
    # counts surplus rows; keep=False below also shows the first occurrences.
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        print(f"\nDuplicate Rows: {duplicate_count}")
        print(f"Examples of Duplicate Rows:\n{df[df.duplicated(keep=False)].head()}")
    else:
        print("\nNo Duplicate Rows Found.")

In [75]:
# First look at the training split, before its columns are labelled.
summarize_dataframe(df=training_data, df_name='Training Data')

Summary for Training Data
Shape: (74682, 4)

Missing Values:
0      0
1      0
2      0
3    686
dtype: int64

Data Types:
0     int64
1    object
2    object
3    object
dtype: object

Descriptive Statistics:
                   0                     1         2  \
count   74682.000000                 74682     74682   
unique           NaN                    32         4   
top              NaN  TomClancysRainbowSix  Negative   
freq             NaN                  2400     22542   
mean     6432.586165                   NaN       NaN   
std      3740.427870                   NaN       NaN   
min         1.000000                   NaN       NaN   
25%      3195.000000                   NaN       NaN   
50%      6422.000000                   NaN       NaN   
75%      9601.000000                   NaN       NaN   
max     13200.000000                   NaN       NaN   

                                                        3  
count                                               73996

In [76]:
# Preview the first five training rows.
training_data.head(n=5)

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [77]:
# Preview the first five validation rows.
validation_data.head(n=5)

Unnamed: 0,0,1,2,3
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [78]:
# Give both splits the same descriptive column labels. The chained assignment
# sets the training frame first, then the validation frame, from one list.
training_data.columns = validation_data.columns = [
    'tweet_id',
    'entity',
    'sentiment',
    'tweet_content',
]

In [79]:
# Check the column labels on the first five training rows.
training_data.head(n=5)

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [80]:
# Check the column labels on the first five validation rows.
validation_data.head(n=5)

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [81]:
# Full summary of the training split.
summarize_dataframe(df=training_data, df_name='Training Data')

Summary for Training Data
Shape: (74682, 4)

Missing Values:
tweet_id           0
entity             0
sentiment          0
tweet_content    686
dtype: int64

Data Types:
tweet_id          int64
entity           object
sentiment        object
tweet_content    object
dtype: object

Descriptive Statistics:
            tweet_id                entity sentiment  \
count   74682.000000                 74682     74682   
unique           NaN                    32         4   
top              NaN  TomClancysRainbowSix  Negative   
freq             NaN                  2400     22542   
mean     6432.586165                   NaN       NaN   
std      3740.427870                   NaN       NaN   
min         1.000000                   NaN       NaN   
25%      3195.000000                   NaN       NaN   
50%      6422.000000                   NaN       NaN   
75%      9601.000000                   NaN       NaN   
max     13200.000000                   NaN       NaN   

                     

In [82]:
# Full summary of the validation split.
summarize_dataframe(df=validation_data, df_name='Validation Data')

Summary for Validation Data
Shape: (1000, 4)

Missing Values:
tweet_id         0
entity           0
sentiment        0
tweet_content    0
dtype: int64

Data Types:
tweet_id          int64
entity           object
sentiment        object
tweet_content    object
dtype: object

Descriptive Statistics:
            tweet_id                  entity sentiment tweet_content
count    1000.000000                    1000      1000          1000
unique           NaN                      32         4           999
top              NaN  RedDeadRedemption(RDR)   Neutral           Wow
freq             NaN                      40       285             2
mean     6432.088000                     NaN       NaN           NaN
std      3728.310569                     NaN       NaN           NaN
min         6.000000                     NaN       NaN           NaN
25%      3247.750000                     NaN       NaN           NaN
50%      6550.000000                     NaN       NaN           NaN
75%      96