# Table Edits

This notebook aims to modify the structure and some data from the table such that it is more insightful in predictions.

In [1]:
import pandas as pd

## Load the Data

In [2]:
df_not_annotated_full = pd.read_csv('sentimentdataset_RAW.csv')
df_multi_class_full = pd.read_csv('sentimentdataset_annotated.csv')
df_binary_class_full = pd.read_csv('sentimentdataset_annotated_binary.csv')
df_multi_class_sensitive_attributes = pd.read_csv('sentimentdataset_multi_class_sensitive_attributes.csv')
df_binary_class_sensitive_attributes = pd.read_csv('sentimentdataset_binary_class_sensitive_attributes.csv')
df_multi_class_non_sensitive_attributes = pd.read_csv('sentimentdataset_multi_class_non_sensitive_attributes.csv')
df_binary_class_non_sensitive_attributes = pd.read_csv('sentimentdataset_binary_class_non_sensitive_attributes.csv')

In [3]:
print(f"The binary class full dataframe has {len(df_binary_class_full.columns)} columns.")
print(f"The binary class non-sensitive attributes dataframe has {len(df_binary_class_non_sensitive_attributes.columns)} columns.")
print(f"The binary class sensitive attributes dataframe has {len(df_binary_class_sensitive_attributes.columns)} columns.")
print(f"The multi-class full dataframe has {len(df_multi_class_full.columns)} columns.")
print(f"The multi-class non-sensitive attributes dataframe has {len(df_multi_class_non_sensitive_attributes.columns)} columns.")
print(f"The multi-class sensitive attributes dataframe has {len(df_multi_class_sensitive_attributes.columns)} columns.")
print(f"The not annotated full dataframe has {len(df_not_annotated_full.columns)} columns.")

The binary class full dataframe has 16 columns.
The binary class non-sensitive attributes dataframe has 13 columns.
The binary class sensitive attributes dataframe has 3 columns.
The multi-class full dataframe has 16 columns.
The multi-class non-sensitive attributes dataframe has 13 columns.
The multi-class sensitive attributes dataframe has 3 columns.
The not annotated full dataframe has 15 columns.


In [5]:
sensitive_attributes = [
    "User",
    "Platform",
    "Country"
]

## Remove Meaningless Columns

In [11]:
def remove_single_value_columns(df: pd.DataFrame, includeList: list = []) -> int:
    dropped_count = 0
    for column in df.columns:
        uniqueValues = df[column].unique()
        if len(uniqueValues) <= 1 and column not in includeList:
            # Remove Useless Columns
            df.drop(column, axis=1, inplace=True)
            dropped_count += 1
    return dropped_count

In [12]:
# Apply the function to all dataframes and print the dropped counts
dropped_count_not_annotated_full = remove_single_value_columns(df_not_annotated_full, sensitive_attributes)
print(f"Dropped {dropped_count_not_annotated_full} columns from df_not_annotated_full")

dropped_count_multi_class_full = remove_single_value_columns(df_multi_class_full, sensitive_attributes)
print(f"Dropped {dropped_count_multi_class_full} columns from df_multi_class_full")

dropped_count_binary_class_full = remove_single_value_columns(df_binary_class_full, sensitive_attributes)
print(f"Dropped {dropped_count_binary_class_full} columns from df_binary_class_full")

dropped_count_multi_class_sensitive_attributes = remove_single_value_columns(df_multi_class_sensitive_attributes, sensitive_attributes)
print(f"Dropped {dropped_count_multi_class_sensitive_attributes} columns from df_multi_class_sensitive_attributes")

dropped_count_binary_class_sensitive_attributes = remove_single_value_columns(df_binary_class_sensitive_attributes, sensitive_attributes)
print(f"Dropped {dropped_count_binary_class_sensitive_attributes} columns from df_binary_class_sensitive_attributes")

dropped_count_multi_class_non_sensitive_attributes = remove_single_value_columns(df_multi_class_non_sensitive_attributes, sensitive_attributes)
print(f"Dropped {dropped_count_multi_class_non_sensitive_attributes} columns from df_multi_class_non_sensitive_attributes")

dropped_count_binary_class_non_sensitive_attributes = remove_single_value_columns(df_binary_class_non_sensitive_attributes, sensitive_attributes)
print(f"Dropped {dropped_count_binary_class_non_sensitive_attributes} columns from df_binary_class_non_sensitive_attributes")

Dropped 0 columns from df_not_annotated_full
Dropped 0 columns from df_multi_class_full
Dropped 0 columns from df_binary_class_full
Dropped 0 columns from df_multi_class_sensitive_attributes
Dropped 0 columns from df_binary_class_sensitive_attributes
Dropped 0 columns from df_multi_class_non_sensitive_attributes
Dropped 0 columns from df_binary_class_non_sensitive_attributes


Conclusion: All columns are important in this dataset.

## Replace NaN Values

In [13]:
def replace_nan_values(df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        if df[column].dtype == 'object':
            # Replace NaN values with empty string for categorical columns
            df[column].fillna('', inplace=True)
        else:
            # Replace NaN values with 0 for numerical columns
            df[column].fillna(0, inplace=True)
    print("NaN values replaced")
    return df

In [14]:
df_binary_class_full = replace_nan_values(df_binary_class_full)
df_binary_class_non_sensitive_attributes = replace_nan_values(df_binary_class_non_sensitive_attributes)
df_binary_class_sensitive_attributes = replace_nan_values(df_binary_class_sensitive_attributes)
df_multi_class_full = replace_nan_values(df_multi_class_full)
df_multi_class_non_sensitive_attributes = replace_nan_values(df_multi_class_non_sensitive_attributes)
df_multi_class_sensitive_attributes = replace_nan_values(df_multi_class_sensitive_attributes)
df_not_annotated_full = replace_nan_values(df_not_annotated_full)

NaN values replaced
NaN values replaced
NaN values replaced
NaN values replaced
NaN values replaced
NaN values replaced
NaN values replaced


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna('', inplace=True)


In [15]:
# Save to CSV 
df_binary_class_full.to_csv('sentimentdataset_annotated_binary.csv', index=False)
df_binary_class_non_sensitive_attributes.to_csv('sentimentdataset_binary_class_non_sensitive_attributes.csv', index=False)
df_binary_class_sensitive_attributes.to_csv('sentimentdataset_binary_class_sensitive_attributes.csv', index=False)
df_multi_class_full.to_csv('sentimentdataset_annotated.csv', index=False)
df_multi_class_non_sensitive_attributes.to_csv('sentimentdataset_multi_class_non_sensitive_attributes.csv', index=False)
df_multi_class_sensitive_attributes.to_csv('sentimentdataset_multi_class_sensitive_attributes.csv', index=False)
df_not_annotated_full.to_csv('sentimentdataset.csv', index=False)
print("Dataframes saved to CSV")

Dataframes saved to CSV
