In [1]:
import pandas as pd

In [2]:
def load_csv(filepath: str) -> "pd.DataFrame | None":
    """
    Loads the given CSV file into a DataFrame.

    The file is parsed with pandas' pure-Python engine. Loading is
    best-effort: expected I/O and parsing failures are reported on stdout and
    signalled by returning ``None`` instead of raising, so callers must check
    the result before using it.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame | None: Loaded DataFrame, or None if the file could not
        be opened or parsed.
    """
    try:
        data = pd.read_csv(filepath, engine="python")
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; pandas' ParserError and
        # EmptyDataError, as well as UnicodeDecodeError, derive from
        # ValueError. Any other exception type indicates a real bug and is
        # deliberately allowed to propagate instead of being hidden.
        print(f"Error while reading the CSV file: {e}")
        return None

    print(f"Data successfully loaded from {filepath}")
    return data

In [3]:
# Load the raw Reddit threads CSV (the file under the 01_raw data folder) into `data`.
# NOTE(review): this is an absolute, machine-specific path; load_csv returns None
# when the file cannot be read, so later cells that index `data` will fail elsewhere.
filepath = "C:/Users/richm/OneDrive/Desktop/DSA4264/DSA4264-Detoxify/kedro-data/data/01_raw/Reddit-Threads_2020-2021.csv"  # Replace with your CSV file path
data = load_csv(filepath)

Data successfully loaded from C:/Users/richm/OneDrive/Desktop/DSA4264/DSA4264-Detoxify/kedro-data/data/01_raw/Reddit-Threads_2020-2021.csv


In [4]:
# Load a timestamped intermediate output (02_intermediate folder) into `data_removed`.
# NOTE(review): judging by the file name this is presumably the raw data with
# removed/deleted comments filtered out — confirm against the producing pipeline step.
filepath1 = "C:/Users/richm/OneDrive/Desktop/DSA4264/DSA4264-Detoxify/kedro-data/data/02_intermediate/removed_deleted_comments_data.csv/2024-10-03T03.55.53.397Z/removed_deleted_comments_data.csv"  # Replace with your CSV file path
data_removed = load_csv(filepath1)

Data successfully loaded from C:/Users/richm/OneDrive/Desktop/DSA4264/DSA4264-Detoxify/kedro-data/data/02_intermediate/removed_deleted_comments_data.csv/2024-10-03T03.55.53.397Z/removed_deleted_comments_data.csv


In [5]:
# Load a second timestamped intermediate output (02_intermediate folder) into `data_nan`.
# NOTE(review): judging by the file name this is presumably the data after rows
# with NaN comments were dropped — confirm against the producing pipeline step.
filepath2 = "C:/Users/richm/OneDrive/Desktop/DSA4264/DSA4264-Detoxify/kedro-data/data/02_intermediate/removed_nan_comments_data.csv/2024-10-03T03.55.53.397Z/removed_nan_comments_data.csv"  # Replace with your CSV file path
data_nan = load_csv(filepath2)

Data successfully loaded from C:/Users/richm/OneDrive/Desktop/DSA4264/DSA4264-Detoxify/kedro-data/data/02_intermediate/removed_nan_comments_data.csv/2024-10-03T03.55.53.397Z/removed_nan_comments_data.csv


In [6]:
def inspect_csv(data: "pd.DataFrame | None") -> None:
    """
    Inspects the given DataFrame by printing a basic summary to stdout.

    The report contains, in order: ``DataFrame.info()``, the first five rows,
    the column index, the per-column count of missing values, the number of
    fully duplicated rows, and ``DataFrame.describe()``.

    Args:
        data (pd.DataFrame | None): DataFrame to inspect. ``None`` is
            accepted and only produces a "No data to inspect." message.

    Returns:
        None
    """
    if data is None:
        print("No data to inspect.")
        return

    # Display basic information about the data.
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it must not be wrapped in print() (that emitted a stray "None" line).
    print("\n--- Basic Information ---")
    data.info()

    # Display the first 5 rows of the CSV file
    print("\n--- First 5 Rows ---")
    print(data.head())

    # Display the column names
    print("\n--- Column Names ---")
    print(data.columns)

    # Check for missing values (count of nulls per column)
    print("\n--- Missing Values ---")
    print(data.isnull().sum())

    # Check for duplicates (rows identical to an earlier row in every column)
    print("\n--- Duplicates ---")
    duplicates = data.duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")

    # Display basic statistics via DataFrame.describe()
    print("\n--- Basic Statistics ---")
    print(data.describe())

In [7]:
# Print the summary report (info, head, columns, nulls, duplicates, describe) for the raw data.
inspect_csv(data)


--- Basic Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2949420 entries, 0 to 2949419
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   text          object
 1   timestamp     object
 2   username      object
 3   link          object
 4   link_id       object
 5   parent_id     object
 6   id            object
 7   subreddit_id  object
 8   moderation    object
dtypes: object(9)
memory usage: 202.5+ MB
None

--- First 5 Rows ---
                                                text            timestamp  \
0                                      STI chiong ah  2020-05-14 12:35:30   
1  Look on the bright side - you'll never make th...  2020-02-09 17:23:24   
2  For posts flaired as such (by OP), we will be ...  2021-04-06 18:08:59   
3  sounds q fucked up if no concern for each othe...  2021-01-22 14:22:42   
4  Chinese media reported a while ago: https://ww...  2020-03-26 04:51:22   

         username                          

In [8]:
# Print the same summary report for `data_removed` so it can be compared with the raw data.
inspect_csv(data_removed)


--- Basic Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2663811 entries, 0 to 2663810
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   text          object
 1   timestamp     object
 2   username      object
 3   link          object
 4   link_id       object
 5   parent_id     object
 6   id            object
 7   subreddit_id  object
 8   moderation    object
dtypes: object(9)
memory usage: 182.9+ MB
None

--- First 5 Rows ---
                                                text            timestamp  \
0                                      STI chiong ah  2020-05-14 12:35:30   
1  Look on the bright side - you'll never make th...  2020-02-09 17:23:24   
2  For posts flaired as such (by OP), we will be ...  2021-04-06 18:08:59   
3  sounds q fucked up if no concern for each othe...  2021-01-22 14:22:42   
4  Chinese media reported a while ago: https://ww...  2020-03-26 04:51:22   

         username                          

In [9]:
# Print the same summary report for `data_nan` so it can be compared with the other two frames.
inspect_csv(data_nan)


--- Basic Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2663779 entries, 0 to 2663778
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   text          object
 1   timestamp     object
 2   username      object
 3   link          object
 4   link_id       object
 5   parent_id     object
 6   id            object
 7   subreddit_id  object
 8   moderation    object
dtypes: object(9)
memory usage: 182.9+ MB
None

--- First 5 Rows ---
                                                text            timestamp  \
0                                      STI chiong ah  2020-05-14 12:35:30   
1  Look on the bright side - you'll never make th...  2020-02-09 17:23:24   
2  For posts flaired as such (by OP), we will be ...  2021-04-06 18:08:59   
3  sounds q fucked up if no concern for each othe...  2021-01-22 14:22:42   
4  Chinese media reported a while ago: https://ww...  2020-03-26 04:51:22   

         username                          

In [10]:
# Spot-check the raw row with index label 10; in the recorded output it is a
# "[removed]" comment from a "[deleted]" user.
data.loc[10]

text                                                    [removed]
timestamp                                     2020-11-11 02:00:20
username                                                [deleted]
link              /r/singapore/comments/jrlfml/honeycomb/gbw6lao/
link_id                                                 t3_jrlfml
parent_id                                               t3_jrlfml
id                                                        gbw6lao
subreddit_id                                             t5_2qh8c
moderation      {'removal_reason': None, 'collapsed': True, 'c...
Name: 10, dtype: object

In [11]:
# Look up index label 10 in `data_removed` for comparison with the raw row.
# NOTE(review): labels come from a fresh RangeIndex per file, so label 10 is not
# necessarily the same comment as in the raw data.
data_removed.loc[10]

text            Wow that's really great of him! And brave of y...
timestamp                                     2020-08-05 08:58:27
username                                               Nephthys88
link            /r/singapore/comments/i3y4l1/youngsters_who_ha...
link_id                                                 t3_i3y4l1
parent_id                                              t1_g0f7sax
id                                                        g0fc7zc
subreddit_id                                             t5_2qh8c
moderation      {'removal_reason': None, 'collapsed': False, '...
Name: 10, dtype: object