# Clearning Data - Non-NLP

### Export the Data

In [34]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

test_df = pd.read_csv('Data/test.csv', low_memory=False,
                     parse_dates=['project_submitted_datetime'])
train_df = pd.read_csv('Data/train.csv', low_memory=False, 
                       parse_dates=['project_submitted_datetime'])


dfs = [test_df, train_df]

### Check Testing Data

In [35]:
test_df.isnull().sum()

id                                                  0
teacher_id                                          0
teacher_prefix                                      1
school_state                                        0
project_submitted_datetime                          0
project_grade_category                              0
project_subject_categories                          0
project_subject_subcategories                       0
project_title                                       0
project_essay_1                                     0
project_essay_2                                     0
project_essay_3                                 75331
project_essay_4                                 75331
project_resource_summary                            0
teacher_number_of_previously_posted_projects        0
dtype: int64

In [36]:
train_df.isnull().sum()

id                                                   0
teacher_id                                           0
teacher_prefix                                       4
school_state                                         0
project_submitted_datetime                           0
project_grade_category                               0
project_subject_categories                           0
project_subject_subcategories                        0
project_title                                        0
project_essay_1                                      0
project_essay_2                                      0
project_essay_3                                 175706
project_essay_4                                 175706
project_resource_summary                             0
teacher_number_of_previously_posted_projects         0
project_is_approved                                  0
dtype: int64

### Handling Null Values

In [37]:
# Place marker for missing teaher prefixes, and projects 3 & 4
for df in dfs:
    df.teacher_prefix = df.teacher_prefix.fillna(value='No Prefix')
    df.project_essay_3 = df.project_essay_3.fillna(value='No Essay')
    df.project_essay_4 = df.project_essay_4.fillna(value='No Essay')

### Seperating Category

In [38]:
train_df.project_subject_categories = train_df.project_subject_categories.str.replace(' & ', ' ')
train_df.project_subject_categories = train_df.project_subject_categories.str.replace(',', '')
train_df.project_subject_categories = train_df.project_subject_categories.str.replace('The ', '')
train_df.project_subject_categories = train_df.project_subject_categories.apply(lambda x: x.split())

In [92]:
train_df.project_subject_categories.head(1)

8323    [Math, Science]
Name: project_subject_categories, dtype: object

In [124]:
dummy_row = pd.get_dummies(pd.DataFrame(data=[[
    'Math', 'Science']]))
dummy_row.columns = row

dummy_row = pd.DataFrame(dummy_row).reset_index(drop=True)
dummy_row[]

Unnamed: 0,Literacy,Language
0,1,1


In [134]:
def get_dummies_from_series_of_lists(series):
    # Creating main dummies output dataframe 
    subject_categories = list()
    for row in series :
        for value in row:
            if value not in subject_categories:
                subject_categories.append(value)
    dummies = pd.DataFrame(columns=subject_categories)
    dummies.sort_index(1)
    
    # Append series data to dummies dataframe
    for row in series:
        dummy_row = pd.get_dummies(pd.DataFrame(data=[row]))
        dummy_row.columns = row
        dummy_row.sort_index(1)
        print(dummy_row)
        print(dummies)
        dummies = dummies.append(dummy_row)
    dummies = dummies.fillna(value=0).reset_index(drop=True)
    return dummies

train_df_project_subject_categories = get_dummies_from_series_of_lists(train_df.project_subject_categories)

   Math  Science
0     1        1
Empty DataFrame
Columns: [Math, Science, Music, Arts, Warmth, Care, Hunger, Health, Sports, Literacy, Language]
Index: []
   Math  Science
0     1        1
  Arts Care Health Hunger Language Literacy Math Music Science Sports Warmth
0  NaN  NaN    NaN    NaN      NaN      NaN    1   NaN       1    NaN    NaN
   Math  Science
0     1        1
  Arts Care Health Hunger Language Literacy Math Music Science Sports Warmth
0  NaN  NaN    NaN    NaN      NaN      NaN    1   NaN       1    NaN    NaN
0  NaN  NaN    NaN    NaN      NaN      NaN    1   NaN       1    NaN    NaN
   Music  Arts
0      1     1
  Arts Care Health Hunger Language Literacy Math Music Science Sports Warmth
0  NaN  NaN    NaN    NaN      NaN      NaN    1   NaN       1    NaN    NaN
0  NaN  NaN    NaN    NaN      NaN      NaN    1   NaN       1    NaN    NaN
0  NaN  NaN    NaN    NaN      NaN      NaN    1   NaN       1    NaN    NaN
   Math  Science  Music  Arts
0     1        1      1

In [136]:
train_df_project_subject_categories

Unnamed: 0,Arts,Care,Health,Hunger,Language,Literacy,Math,Music,Science,Sports,Warmth
0,0,0,0,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,1,0,1,0,0
3,1,0,0,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,1,1,1,0,0
5,0,1,0,1,0,0,0,0,0,0,1
6,0,0,0,0,0,0,1,0,1,0,0
7,0,0,0,0,0,0,1,0,1,0,0
8,0,0,1,0,0,0,0,0,0,1,0
9,0,0,0,0,1,1,0,0,0,0,0


### Export the Data

In [None]:
train_df.to_csv('Data/cleaned_train_df.csv')
test_df.to_csv('Data/cleaned_test_df.csv')