In [20]:
# jai chung
# classification

import os
import numpy as np
import pandas as pd
from collections import Counter
from autocorrect import Speller
import nltk

# set static variables
dir_str = os.getcwd()
file_n = "/test_data.csv"

# colors
custom_palette = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"]

In [21]:
# read data
data_str = dir_str + file_n
df = pd.read_csv(data_str, header = 0)

# numerical features & check if NaNs exist
types = df.dtypes
nums_i = (types == int)|(types == float)
nums = df.loc[:,nums_i]
print(nums.describe())
print("Number of NAs for numerical values:")
print(pd.isnull(nums).sum())

# non-numerical features & check if NaNs exist
n_nums = df.loc[:,~nums_i]
n_nums_cols = list(n_nums.columns)
print(n_nums.describe())
print("Number of NAs for non-numerical values:")
print(pd.isnull(n_nums).sum())

               age  hours_studied
count  5000.000000    3018.000000
mean     25.873400       4.964076
std       7.349978       3.203037
min      18.000000       0.139203
25%      20.000000       2.555823
50%      24.000000       4.247235
75%      30.000000       6.660737
max      50.000000      24.259314
Number of NAs for numerical values:
age                 0
hours_studied    1982
dtype: int64
         sex    lang country  class test_prep   pass   notes  \
count   5000    5000    5000   5000      5000   5000     766   
unique     2       6      10      2         2      2     377   
top     Male  Korean   Korea  False     False  False  #NAME?   
freq    3719    4022    4022   3518      3986   2665       9   

       goals/hopes/concerns  
count                    84  
unique                   79  
top            nothing much  
freq                      3  
Number of NAs for non-numerical values:
sex                        0
lang                       0
country                    0
cla

Numerical variables: It is shown that 40% of the data for hours_studied does not have an input value - we assume that a lack of input means that the student has not invested any time whatsoever.

In [22]:
df['class']

0        True
1       False
2       False
3       False
4       False
        ...  
4995    False
4996    False
4997    False
4998     True
4999    False
Name: class, Length: 5000, dtype: bool

In [23]:
# replace NaN of hours_studied with 0
df['hours_studied'].replace(np.nan, 0, inplace = True)

# combine class and test prep variable for eda purposes
df['class'] = np.where((df['class'] == True)&(df['test_prep'] == True), 'Both', 
                       np.where(df['class'] == True, 'Only class',
                       np.where(df['test_prep'] == True, 'Only test', 'None')))

# combine class and test prep variable for eda purposes
df['class2'] = np.where((df['class'] == True)|(df['test_prep'] == True), 
                        'Took a course/courses', 'Did not take a course')

In [24]:
Counter(df['notes'])

Counter({nan: 4234,
         'CON': 6,
         '@{[system "touch /tmp/blns.fail"]}': 2,
         '`⁄€‹›ﬁﬂ‡°·‚—±': 5,
         '᚛ᚄᚓᚐᚋᚒᚄ\u1680ᚑᚄᚂᚑᚏᚅ᚜': 3,
         'Œ„´‰ˇÁ¨ˆØ∏”’': 1,
         '😍': 2,
         'ABC<div style="x:exp\\x00ression(javascript:alert(1)">DEF': 4,
         '\xad\u0600\u0601\u0602\u0603\u0604\u0605\u061c\u06dd\u070f\u180e\u200b\u200c\u200d\u200e\u200f\u202a\u202b\u202c\u202d\u202e\u2060\u2061\u2062\u2063\u2064\u2066\u2067\u2068\u2069\u206a\u206b\u206c\u206d\u206e\u206f\ufeff\ufff9\ufffa\ufffb\U000110bd\U0001bca0\U0001bca1\U0001bca2\U0001bca3\U0001d173\U0001d174\U0001d175\U0001d176\U0001d177\U0001d178\U0001d179\U0001d17a\U000e0001\U000e0020\U000e0021\U000e0022\U000e0023\U000e0024\U000e0025\U000e0026\U000e0027\U000e0028\U000e0029\U000e002a\U000e002b\U000e002c\U000e002d\U000e002e\U000e002f\U000e0030\U000e0031\U000e0032\U000e0033\U000e0034\U000e0035\U000e0036\U000e0037\U000e0038\U000e0039\U000e003a\U000e003b\U000e003c\U000e003d\U000e003e\U000e003f\U000e0040\U000e0041

Entry data in the column of notes was difficult to understand - so notes column will be disregarded for now. Same with names.

In [25]:
Counter(df['goals/hopes/concerns'])

Counter({'pass the test': 1,
         nan: 4916,
         '*goal: improve progrmaming skill, *concern: no time to study': 1,
         'improve enlish': 1,
         'english and programming language': 1,
         'my concer: not good at english': 1,
         'goal': 1,
         'My goal is to improve my technical skills.': 1,
         'It is unclear how to utilize skills after returning': 1,
         'like to learn more technical skill': 1,
         'learn Korean': 1,
         'nothing much': 3,
         'speak better englihs, hope to pass the exam': 1,
         'improve test score': 1,
         '■Goal pass the eglish exam ■Concern Lack of implementation experience and skill': 1,
         'concern about understanding Korean': 1,
         'Get used to english': 1,
         'Improving programming skills and practicing Scrum': 1,
         "I want to communicate with my colleagues from abroad in English, but I can't speak it well.": 1,
         'know more Korean cultural': 1,
         'get 

Column 'goals/hopes/concerns' on the other hand provides us more data compared to 'notes' but some rows have a few spelling mistakes

In [26]:
from autocorrect import spell
df['goals/hopes/concerns'] = df['goals/hopes/concerns'].replace(np.nan, '')
spell = Speller(lang = 'en')
df['goals/hopes/concerns'] = [' '.join([spell(i) for i in x.split()]) for x in df['goals/hopes/concerns']]
Counter(df['goals/hopes/concerns'])

Counter({'pass the test': 1,
         '': 4916,
         '*goal: improve programming skill, *concern: no time to study': 1,
         'improve english': 2,
         'english and programming language': 1,
         'my cancer: not good at english': 1,
         'goal': 1,
         'My goal is to improve my technical skills.': 1,
         'It is unclear how to utilize skills after returning': 1,
         'like to learn more technical skill': 1,
         'learn Korean': 1,
         'nothing much': 3,
         'speak better english, hope to pass the exam': 1,
         'improve test score': 1,
         '■Goal pass the english exam ■Concern Lack of implementation experience and skill': 1,
         'concern about understanding Korean': 1,
         'Get used to english': 1,
         'Improving programming skills and practicing Scrum': 1,
         "I want to communicate with my colleagues from abroad in English, but I can't speak it well.": 1,
         'know more Korean cultural': 1,
         'get

In [27]:
nltk.download(['punkt','stopwords'])
txt = df['goals/hopes/concerns'].str.lower().str.replace(r'\|', ' ').str.cat(sep=' ')

words = nltk.tokenize.word_tokenize(txt)
word_dist = nltk.FreqDist(words)
stopwords = nltk.corpus.stopwords.words('english')
words_except_stop_dist = nltk.FreqDist(w for w in words if w not in stopwords)

print('All frequencies, excluding STOPWORDS:')
print('=' * 60)
rslt = pd.DataFrame(words_except_stop_dist.most_common(20), columns=['Word', 'Frequency']).set_index('Word')
print(rslt)

[nltk_data] Downloading package punkt to /Users/jaichung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaichung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


All frequencies, excluding STOPWORDS:
           Frequency
Word                
english           22
learn             14
skill             11
:                 10
time               9
korean             9
get                8
scrum              8
study              7
technical          7
pass               6
goal               6
improve            6
.                  6
,                  5
concern            5
good               5
skills             5
exam               5
test               4


  txt = df['goals/hopes/concerns'].str.lower().str.replace(r'\|', ' ').str.cat(sep=' ')


We can see that the keyword 'english' appears quite often among the input data of the column 'goals/hopes/concerns'

In [28]:
%store df
%store custom_palette
%store rslt

Stored 'df' (DataFrame)
Stored 'custom_palette' (list)
Stored 'rslt' (DataFrame)


In [29]:
df

Unnamed: 0,sex,lang,country,age,hours_studied,class,test_prep,pass,notes,goals/hopes/concerns,class2
0,Male,Korean,Korea,20,10.578594,Only class,False,True,,pass the test,Did not take a course
1,Male,Korean,Korea,22,7.814520,,False,True,,,Did not take a course
2,Male,Korean,Korea,27,4.213682,,False,True,,"*goal: improve programming skill, *concern: no...",Did not take a course
3,Male,Korean,Korea,24,0.000000,,False,True,,,Did not take a course
4,Female,Korean,Korea,21,3.248907,,False,False,,,Did not take a course
...,...,...,...,...,...,...,...,...,...,...,...
4995,Male,Korean,Korea,29,3.291721,,False,True,,,Did not take a course
4996,Female,Korean,Korea,22,1.590491,,False,False,,,Did not take a course
4997,Male,English,UK,25,5.029988,,False,False,,learn business Korean,Did not take a course
4998,Male,Korean,Korea,22,1.730570,Only class,False,True,,,Did not take a course
