# Clearning Data - Non-NLP

### Export the Data

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import *

test_df = pd.read_csv('Data/test.csv', low_memory=False,
                     parse_dates=['project_submitted_datetime'])
train_df = pd.read_csv('Data/train.csv', low_memory=False, 
                       parse_dates=['project_submitted_datetime'])

dfs = [test_df, train_df]

### Check Testing Data

In [2]:
test_df.isnull().sum()

id                                                  0
teacher_id                                          0
teacher_prefix                                      1
school_state                                        0
project_submitted_datetime                          0
project_grade_category                              0
project_subject_categories                          0
project_subject_subcategories                       0
project_title                                       0
project_essay_1                                     0
project_essay_2                                     0
project_essay_3                                 75331
project_essay_4                                 75331
project_resource_summary                            0
teacher_number_of_previously_posted_projects        0
dtype: int64

In [3]:
train_df.isnull().sum()

id                                                   0
teacher_id                                           0
teacher_prefix                                       4
school_state                                         0
project_submitted_datetime                           0
project_grade_category                               0
project_subject_categories                           0
project_subject_subcategories                        0
project_title                                        0
project_essay_1                                      0
project_essay_2                                      0
project_essay_3                                 175706
project_essay_4                                 175706
project_resource_summary                             0
teacher_number_of_previously_posted_projects         0
project_is_approved                                  0
dtype: int64

### Handling Null Values

In [4]:
# Place marker for missing teaher prefixes, and projects 3 & 4
for df in dfs:
    df.teacher_prefix = df.teacher_prefix.fillna(value='No Prefix')
    df.project_essay_3 = df.project_essay_3.fillna(value='No Essay')
    df.project_essay_4 = df.project_essay_4.fillna(value='No Essay')

### Seperating Categories into Dummy Features

In [5]:
def clean_categories(series):
    series = series.str.replace(' & ', ' ')
    series = series.str.replace(',', '')
    series = series.str.replace('The ', '')
    series = series.apply(lambda x: x.split())
    series = series.apply(lambda x: list(np.unique(x)))
    return series

def get_dummies_from_series_of_lists(series, column_prefix):
    # Creating main dummies output dataframe 
    subject_categories = list()
    for row in tqdm(series):
        for value in row:
            if value not in subject_categories:
                subject_categories.append(value)
    dummies = pd.DataFrame(columns=subject_categories)
    dummies = dummies.sort_index(1)
    
    # Append series data to dummies dataframe
    for row in tqdm(series):
        dummy_row = pd.get_dummies(pd.DataFrame(data=[row]))
        for i, value in enumerate(row):
            row[i] = column_prefix+row[i]
        dummy_row.columns = row
        dummy_row = dummy_row.sort_index(1)
        dummies = dummies.append(dummy_row)
    dummies = dummies.fillna(value=0).reset_index(drop=True)
    return dummies

In [6]:
len(train_df.columns)

16

In [7]:
train_df.project_subject_categories = clean_categories(train_df.project_subject_categories)
train_df_categories_dummies = get_dummies_from_series_of_lists(train_df.project_subject_categories,
                                                              'cat_')
train_df = pd.concat([train_df,train_df_categories_dummies],axis=1)

test_df.project_subject_categories = clean_categories(test_df.project_subject_categories)
test_df_categories_dummies = get_dummies_from_series_of_lists(test_df.project_subject_categories,
                                                             'cat_')
test_df = pd.concat([test_df,test_df_categories_dummies],axis=1)

100%|██████████| 182080/182080 [00:00<00:00, 1302200.41it/s]
100%|██████████| 182080/182080 [3:54:16<00:00,  7.39it/s]
100%|██████████| 78035/78035 [00:00<00:00, 1306596.86it/s]
100%|██████████| 78035/78035 [46:16<00:00, 28.11it/s]


### Seperating Subcategories into Dummy Features

In [8]:
train_df.project_subject_subcategories = clean_categories(train_df.project_subject_subcategories)
train_df_subcategories_dummies = get_dummies_from_series_of_lists(train_df.project_subject_subcategories,
                                                                 'subcat_')
train_df = pd.concat([train_df,train_df_subcategories_dummies],axis=1)

test_df.project_subject_subcategories = clean_categories(test_df.project_subject_subcategories)
test_df_subcategories_dummies = get_dummies_from_series_of_lists(test_df.project_subject_subcategories,
                                                                'subcat_')
test_df = pd.concat([test_df,test_df_subcategories_dummies],axis=1)

100%|██████████| 182080/182080 [00:00<00:00, 937889.30it/s]
100%|██████████| 182080/182080 [9:47:22<00:00,  2.77it/s]
100%|██████████| 78035/78035 [00:00<00:00, 940624.47it/s]
100%|██████████| 78035/78035 [1:52:17<00:00,  5.68it/s]


### Export the Data

In [10]:
test_df.columns = test_df.columns.str.lower()
train_df.columns = train_df.columns.str.lower()
train_df.to_csv('Data/cleaned_train_df.csv', index=False)
test_df.to_csv('Data/cleaned_test_df.csv', index=False)