In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the three CSV files (train split, test comments, test labels).
# Paths are relative to the notebook's working directory.
train_data, test_data, test_labels = map(
    pd.read_csv,
    ('../data/train.csv', '../data/test.csv', '../data/test_labels.csv'),
)

In [4]:
# Report the number of rows in each loaded frame.
for split_name, frame in (
    ("train samples", train_data),
    ("test samples", test_data),
    ("test labels", test_labels),
):
    print(f"Total {split_name}: {len(frame)}")

Total train samples: 159571
Total test samples: 153164
Total test labels: 153164


In [5]:
# Pair every loaded frame with the label used in the printed report so each
# section below is a single loop instead of three copy-pasted prints.
frames_by_label = {
    "train data": train_data,
    "test data": test_data,
    "test labels": test_labels,
}
rule = "=" * 40

# first row of each frame, with a blank line between frames
print(rule + " First rows " + rule)
for position, (label, frame) in enumerate(frames_by_label.items()):
    if position:
        print()
    print(f"First row of {label}: \n{frame.head(1)}")

# column names of each frame (label is capitalised: "Train data", ...)
print(rule + " Column names " + rule)
for position, (label, frame) in enumerate(frames_by_label.items()):
    if position:
        print()
    print(f"{label.capitalize()} column names: \n{frame.columns}")

# count of missing values per column, per frame
for position, (label, frame) in enumerate(frames_by_label.items()):
    if position:
        print()
    print(f"{label}:")
    print(frame.isnull().sum())

First row of train data: 
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  

First row of test data: 
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...

First row of test labels: 
                 id  toxic  severe_toxic  obscene  threat  insult  \
0  00001cee341fdb12     -1            -1       -1      -1      -1   

   identity_hate  
0             -1  
Train data column names: 
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

Test data column names: 
Index(['id', 'comment_text'], dtype='object')

Test labels column names: 
Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insu

In [6]:
# Count the test-label rows in which at least one label column equals -1.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Row-wise flag: True when any of the label columns holds -1.
has_neg_1 = test_labels[label_cols].eq(-1).any(axis=1)

# Keep only the rows where no label column is -1.
no_neg_1 = test_labels[~has_neg_1]

n_total = len(test_labels)
n_without_neg_1 = len(no_neg_1)

print(f"Total test samples: {n_total}")
print(f"Total test samples with no -1's: {n_without_neg_1}")
print(f"Total test samples with -1's: {n_total - n_without_neg_1}")



Total test samples: 153164
Total test samples with no -1's: 63978
Total test samples with -1's: 89186


In [7]:
# Create processed test data with no -1 labels: attach the comment text from
# test_data to every row of no_neg_1 by joining on 'id'.
# validate='one_to_one' makes pandas raise MergeError if 'id' is duplicated on
# either side, so the join cannot silently multiply rows.
merged = pd.merge(no_neg_1, test_data, on='id', how='inner', validate='one_to_one')

# An inner join drops ids that are absent from test_data; fail loudly here
# rather than writing a silently shortened file.
assert len(merged) == len(no_neg_1), (
    f"{len(no_neg_1) - len(merged)} labelled ids have no matching row in test_data"
)

# Reorder columns to match train data
column_order = ['id', 'comment_text'] + label_cols
test_1 = merged[column_order]

# Save to test_1.csv (index=False keeps the row index out of the file)
test_1.to_csv('../data/test_1.csv', index=False)

print(f"Created test_1.csv with {len(test_1):,} samples")
print("First few rows:")
print(test_1.head())


Created test_1.csv with 63,978 samples
First few rows:
                 id                                       comment_text  toxic  \
0  0001ea8717f6de06  Thank you for understanding. I think very high...      0   
1  000247e83dcc1211                   :Dear god this site is horrible.      0   
2  0002f87b16116a7f  "::: Somebody will invariably try to add Relig...      0   
3  0003e1cccfd5a40a  " \n\n It says it right there that it IS a typ...      0   
4  00059ace3e3e9a53  " \n\n == Before adding a new product to the l...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
