# Data Cleaning & EDA

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from functions import *

# Paths
save_data = '../data'
save_images = '../images'

if os.path.exists(save_data):
  print('Data: Save point initialized.')
if os.path.exists(save_images):
  print('Images: Save point initialized.')

Data: Save point initialized.
Images: Save point initialized.


In [None]:
"""
from datasets import load_dataset

ds = load_dataset("valurank/PoliticalBias_AllSides_Txt")

ds
"""

In [3]:
# Downloaded the labeled dataset manually.
datasets_path = save_data + '/AllSides'

if os.path.exists(datasets_path):
  print('Datasets: Save point initialized.\n')
  print(os.listdir(datasets_path))

Datasets: Save point initialized.

['Center Data', 'Left Data', 'Right Data']


In [4]:
ds_center_path = datasets_path + '/Center Data'
ds_left_path = datasets_path + '/Left Data'
ds_right_path = datasets_path + '/Right Data'

if os.path.exists(ds_left_path):
  print(f'Left Data: Save point initialized: {len(os.listdir(ds_left_path))} files')
if os.path.exists(ds_center_path):
  print(f'Center Data: Save point initialized: {len(os.listdir(ds_center_path))} files')
if os.path.exists(ds_right_path):
  print(f'Right Data: Save point initialized: {len(os.listdir(ds_right_path))} files')

Left Data: Save point initialized: 7803 files
Center Data: Save point initialized: 3996 files
Right Data: Save point initialized: 5563 files


In [5]:
# Convert to DataFrame
df_l = pd.DataFrame() # Left, 0
df_c = pd.DataFrame() # Center, 1
df_r = pd.DataFrame() # Right, 2

# Loops
for file in os.listdir(ds_left_path): # Left
    path = os.path.join(ds_left_path, file)
    with open(path, 'r') as f:
        content = f.read()
        df_l = pd.concat([df_l, pd.DataFrame({'text': [content], 'label': ['0']})], ignore_index=True)
    
for file in os.listdir(ds_center_path): # Center
    path = os.path.join(ds_center_path, file)
    with open(path, 'r') as f:
        content = f.read()
        df_c = pd.concat([df_c, pd.DataFrame({'text': [content], 'label': ['1']})], ignore_index=True)

for file in os.listdir(ds_right_path): # Right
    path = os.path.join(ds_right_path, file)
    with open(path, 'r') as f:
        content = f.read()
        df_r = pd.concat([df_r, pd.DataFrame({'text': [content], 'label': ['2']})], ignore_index=True)

# Prints
df_l.shape, df_c.shape, df_r.shape

((7803, 2), (3996, 2), (5563, 2))

In [6]:
# Merge
df = pd.concat([df_c, df_l, df_r], axis=0)
df.info(), df['label'].value_counts(), df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 17362 entries, 0 to 5562
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    17362 non-null  object
 1   label   17362 non-null  object
dtypes: object(2)
memory usage: 406.9+ KB


(None,
 label
 0    7803
 2    5563
 1    3996
 Name: count, dtype: int64,
 (17362, 2))

In [None]:
# Clean text
df['text'] = df['text'].str.strip()
df['text'] = df['text'].str.replace('\n', ' ')

df.head()

Unnamed: 0,word_count
count,17362.0
mean,964.337058
std,3212.467368
min,49.0
25%,487.0
50%,771.0
75%,1122.0
max,204273.0


In [12]:
# Sampling for max 3996
df = df.groupby('label').sample(n=3996,
                                random_state=42).reset_index(drop=True)

df['label'].value_counts(), df['label'].value_counts(normalize=True)

(label
 0    3996
 1    3996
 2    3996
 Name: count, dtype: int64,
 label
 0    0.333333
 1    0.333333
 2    0.333333
 Name: proportion, dtype: float64)

Split

In [13]:
# Stratifed Split
from sklearn.model_selection import train_test_split

# Split df by label
df_train, df_test = train_test_split(df,
                                     test_size=0.2,
                                     stratify=df['label'],
                                     random_state=42)

df_train.shape, df_test.shape

((9590, 4), (2398, 4))

In [14]:
df_train['label'].value_counts(normalize=True), df_test['label'].value_counts(normalize=True)

(label
 0    0.333368
 1    0.333368
 2    0.333264
 Name: proportion, dtype: float64,
 label
 2    0.333611
 0    0.333194
 1    0.333194
 Name: proportion, dtype: float64)

Save

In [15]:
# Save
train_path = os.path.join(save_data, 'train.csv')
test_path = os.path.join(save_data, 'test.csv')

df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

if os.path.exists(train_path):
  print('Train saved.')
if os.path.exists(test_path):
  print('Test saved.')

Train saved.
Test saved.
