In [2]:
"""
Author: Kruti Jayantilal Kotadia
Purpose: Descriptive analysis of Twitter Posts using Pandas, Polars, and Base Python
"""

import pandas as pd
import polars as pl

# Location of the Twitter posts CSV; the Pandas and Polars cells below both read it via csv_path.
# NOTE(review): absolute "/content/..." path — presumably a Colab upload location; confirm and
# adjust when running in another environment.
csv_path = "/content/2024_tw_posts_president_scored_anon.csv"


In [3]:
# Pandas Analysis
# Load the CSV into `tw_df`, then print its shape, per-column missing-value
# counts, first five rows, and summary statistics.

# Only the read itself is guarded. The message is still printed for context,
# but the error is re-raised: the Base Python cell further down needs `tw_df`,
# so carrying on after a failed load would either hit a NameError there or
# silently reuse a stale `tw_df` left in the kernel by an earlier run.
try:
    # on_bad_lines='skip' drops malformed rows without raising or warning, so
    # the row count printed below only reflects rows that parsed cleanly.
    tw_df = pd.read_csv(csv_path, on_bad_lines='skip', encoding='utf-8', engine='python')
except Exception as e:
    print(f"Pandas read error: {e}")
    raise

print("\n Twitter Posts Summary (Pandas)")
print(f"Rows: {tw_df.shape[0]}, Columns: {tw_df.shape[1]}")

print("\nMissing Values:")
print(tw_df.isnull().sum())

print("\nTop 5 Records:")
print(tw_df.head())

print("\nNumeric Descriptive Stats:")
# Transposed so that each described column becomes one row of the table.
print(tw_df.describe().T)


 Twitter Posts Summary (Pandas)
Rows: 27304, Columns: 47

Missing Values:
id                                               0
url                                              0
source                                           0
retweetCount                                     0
replyCount                                       0
likeCount                                        0
quoteCount                                       0
viewCount                                        0
createdAt                                        0
lang                                             0
bookmarkCount                                    0
isReply                                          0
isRetweet                                        0
isQuote                                          0
isConversationControlled                         0
quoteId                                      24017
inReplyToId                                  23959
month_year                                       0
illumin

In [4]:
# Polars Analysis
# Load the same CSV with Polars into `tw_pl`, then print its shape, inferred
# schema, per-column null counts, and descriptive statistics.

# Only the read is guarded, so "Polars read error" is reported for read
# failures only. The error is re-raised rather than swallowed so a failed load
# stops the run instead of leaving `tw_pl` undefined (or stale from an earlier
# execution).
try:
    # ignore_errors=True asks Polars to keep reading when some lines yield
    # errors instead of aborting the whole load.
    tw_pl = pl.read_csv(csv_path, ignore_errors=True)
except Exception as e:
    print(f"Polars read error: {e}")
    raise

print("\n Twitter Posts Summary (Polars)")
print("Shape:", tw_pl.shape)
print("Schema:", tw_pl.schema)
print("Null Count:")
print(tw_pl.null_count())

print("\nDescriptive Stats:")
print(tw_pl.describe())


 Twitter Posts Summary (Polars)
Shape: (27304, 47)
Schema: Schema([('id', String), ('url', String), ('source', String), ('retweetCount', Int64), ('replyCount', Int64), ('likeCount', Int64), ('quoteCount', Int64), ('viewCount', Int64), ('createdAt', String), ('lang', String), ('bookmarkCount', Int64), ('isReply', Boolean), ('isRetweet', Boolean), ('isQuote', Boolean), ('isConversationControlled', Boolean), ('quoteId', Float64), ('inReplyToId', Float64), ('month_year', String), ('illuminating_scored_message', String), ('election_integrity_Truth_illuminating', Float64), ('advocacy_msg_type_illuminating', Float64), ('issue_msg_type_illuminating', Float64), ('attack_msg_type_illuminating', Float64), ('image_msg_type_illuminating', Float64), ('cta_msg_type_illuminating', Float64), ('engagement_cta_subtype_illuminating', Float64), ('fundraising_cta_subtype_illuminating', Float64), ('voting_cta_subtype_illuminating', Float64), ('covid_topic_illuminating', Float64), ('economy_topic_illuminatin

In [5]:
# Base Python Analysis
# For every numeric column of `tw_df`, compute count, mean, min and max using
# only Python built-ins (len / sum / min / max) on the non-missing values.

# "\n" (not "\ "): the backslash-space was an invalid escape sequence that
# printed a literal backslash and triggers a SyntaxWarning on newer Pythons.
print("\nBase Python Stats (Numeric Columns)")

# include="number" selects every numeric dtype, not just int64/float64, so
# columns stored at another width or as nullable numerics are not skipped.
numeric_cols = tw_df.select_dtypes(include="number").columns

for col in numeric_cols:
    # Missing values are removed first, so Count is the number of non-null entries.
    values = tw_df[col].dropna().tolist()
    if not values:
        # Column is entirely missing: nothing to summarise (also avoids dividing by zero).
        continue
    count = len(values)
    mean_val = sum(values) / count
    print(f"{col}: Count = {count}, Mean = {mean_val:.2f}, Min = {min(values)}, Max = {max(values)}")


\ Base Python Stats (Numeric Columns)
retweetCount: Count = 27304, Mean = 1322.06, Min = 0, Max = 144615
replyCount: Count = 27304, Mean = 1063.79, Min = 0, Max = 121270
likeCount: Count = 27304, Mean = 6913.69, Min = 0, Max = 915221
quoteCount: Count = 27304, Mean = 128.08, Min = 0, Max = 123320
viewCount: Count = 27304, Mean = 507084.73, Min = 5, Max = 333502775
bookmarkCount: Count = 27304, Mean = 136.21, Min = 0, Max = 42693
quoteId: Count = 3287, Mean = 1764298396658922752.00, Min = 7.912639390153769e+17, Max = 1.8535761683325583e+18
inReplyToId: Count = 3345, Mean = 1758285724741985792.00, Min = 1.2400673584227e+18, Max = 1.853530653414859e+18
election_integrity_Truth_illuminating: Count = 26034, Mean = 0.04, Min = 0.0, Max = 1.0
advocacy_msg_type_illuminating: Count = 26034, Mean = 0.56, Min = 0.0, Max = 1.0
issue_msg_type_illuminating: Count = 26034, Mean = 0.51, Min = 0.0, Max = 1.0
attack_msg_type_illuminating: Count = 26034, Mean = 0.31, Min = 0.0, Max = 1.0
image_msg_type_i

**Steps Performed**

Data Loading:

1. The CSV was read in Pandas with on_bad_lines='skip', engine='python', and encoding='utf-8', so that malformed rows are skipped instead of aborting the load.

2. Also loaded using Polars with ignore_errors=True.

Pandas Analysis

1. Displayed row/column counts

2. Counted missing values

3. Showed the top 5 rows

4. Generated summary statistics (mean, std, min, max, etc.)

Polars Analysis

1. Inspected shape and schema

2. Displayed null counts using null_count()

3. Used describe() to generate fast summary stats

Base Python Summary

For each numeric column, the following were computed manually from its non-missing values:

1. Total count

2. Mean

3. Minimum and maximum values

