In [1]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook container uses the full page width.
display(
    HTML("<style>.container { width:100% !important; }</style>")
)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the cleaned survey responses from disk.
survey_data = pd.read_csv('data_cleaned.csv')

# Report the (rows, columns) shape, then preview the first five rows.
print(survey_data.shape)

survey_data.head(n=5)

(491, 12)


Unnamed: 0,gender,mother_tongue,mother_tongue_rating,no_of_fluent_languages,state,resided_in_the_same_city_from_birth,teaching_language_preference,notes_language_preference,questionpaper_language_preference,exam_in_regional_language,teaching_in_regional_language,knowledge_improvement
0,Female,Kannada,5.0,4,Karnataka,Yes,Hybrid language,Hybrid language,Regional language,Yes,Yes,Yes
1,Female,Kannada,5.0,3,Karnataka,Yes,English language,English language,English language,Yes,No,No
2,Male,Kannada,5.0,4,Karnataka,Yes,Hybrid language,Hybrid language,Hybrid language,Yes,Maybe,Yes
3,Female,Kannada,4.0,3,Karnataka,No,English language,English language,English language,Maybe,Maybe,Yes
4,Female,Kannada,5.0,5,Karnataka,Yes,Regional language,Hybrid language,Hybrid language,Yes,Yes,Yes


# The Variable Types

In [4]:
# Every column whose dtype is not object ('O') is treated as numerical.
numerical_features = [
    column
    for column in survey_data.columns
    if survey_data[column].dtype != 'O'
]
print("Total number of numerical features: ", len(numerical_features))
print(numerical_features)

Total number of numerical features:  2
['mother_tongue_rating', 'no_of_fluent_languages']


In [5]:
# Categorical features are all remaining columns: everything that is not
# numerical and is not the prediction target ('knowledge_improvement').
categorical_features = [
    feature
    for feature in survey_data.columns
    if feature not in numerical_features and feature != 'knowledge_improvement'
]
# The label previously said "numerical" (copy-paste from the cell above);
# this count is for the categorical columns.
print("Total number of categorical features: ", len(categorical_features))
print(categorical_features)

Total number of numerical features:  9
['gender', 'mother_tongue', 'state', 'resided_in_the_same_city_from_birth', 'teaching_language_preference', 'notes_language_preference', 'questionpaper_language_preference', 'exam_in_regional_language', 'teaching_in_regional_language']


# Splitting train and test data

In [6]:
# Separate the target column ('knowledge_improvement') from the predictors and
# hold out 40% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    survey_data.drop(columns=['knowledge_improvement']),
    survey_data['knowledge_improvement'],
    test_size=0.4,
    random_state=12,
)

# Categorical Encoding

In [7]:
from feature_engine.encoding import OneHotEncoder

# One-hot encode the categorical columns. The encoder is fitted on the
# training split only and then applied to both splits.
destination_encoder = OneHotEncoder(
    variables=categorical_features,
    drop_last=True,
)
destination_encoder.fit(X_train)

# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# without re-running the split cell feeds already-encoded frames to the encoder.
X_train, X_test = (
    destination_encoder.transform(X_train),
    destination_encoder.transform(X_test),
)

In [8]:
# Preview the first five rows of the encoded training features.
X_train.head(n=5)

Unnamed: 0,mother_tongue_rating,no_of_fluent_languages,gender_Male,gender_Female,mother_tongue_Kannada,mother_tongue_Tamil,mother_tongue_Telugu,mother_tongue_Hindi,mother_tongue_Bengali,mother_tongue_Tulu,...,teaching_language_preference_Hybrid language,teaching_language_preference_English language,notes_language_preference_English language,notes_language_preference_Regional language,questionpaper_language_preference_Hybrid language,questionpaper_language_preference_English language,exam_in_regional_language_No,exam_in_regional_language_Yes,teaching_in_regional_language_Maybe,teaching_in_regional_language_Yes
8,5.0,3,1,0,1,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
161,5.0,2,0,1,1,0,0,0,0,0,...,0,1,1,0,0,1,1,0,1,0
483,4.0,2,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
356,5.0,3,1,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0
250,5.0,3,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,1,0


In [9]:
# Persist the encoded feature splits as CSV; the row index is not written.
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)

In [10]:
def encode_target(value):
    """Map a survey answer to its integer class code.

    'No' -> 0, 'Yes' -> 1, 'Maybe' -> 2.

    Raises:
        KeyError: if ``value`` is not one of the three known answers.
    """
    label_codes = dict(No=0, Yes=1, Maybe=2)
    return label_codes[value]

In [11]:
# Convert the 'Yes'/'No'/'Maybe' labels of both splits to integer codes.
# Not idempotent: a second run would pass the integer codes back into
# encode_target, whose lookup table only has the string answers as keys.
y_train, y_test = (
    labels.apply(encode_target) for labels in (y_train, y_test)
)

In [12]:
# Persist the encoded target splits as CSV; the row index is not written.
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)