In [1]:
#Import Libs
import pandas as pd
import numpy as np

# For ordered categorical data
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the data
df_nas = pd.read_csv("../nas_2014.csv")

### Exploratory Data Analysis

#### 1. Check the range of the marks

In [3]:
df_nas.filter(like='%', axis=1).describe()

Unnamed: 0,Maths %,Reading %,Science %,Social %
count,92469.0,93175.0,90956.0,89528.0
mean,32.259371,46.223004,36.416349,38.361449
std,16.089177,21.829832,14.906687,15.132517
min,0.0,0.0,0.0,0.0
25%,21.67,27.59,25.93,28.0
50%,27.78,43.33,33.93,36.0
75%,37.93,63.33,44.64,48.0
max,100.0,100.0,100.0,96.0


Reading has the highest average score (46.2%) and Maths the lowest (32.3%).

##### 1.1 How many students have 100 in maths

In [4]:
print (f"There are {df_nas[df_nas['Maths %'] == 100].shape[0]} students with 100% marks in Maths.")

There are 9 students with 100% marks in Maths.


##### 1.2 How many students have 100 in Reading

In [5]:
df_nas[df_nas['Reading %'] == 100].shape[0]

81

#### 2 By State
##### 2.1 Top 5 states with highest and lowest number of students

In [6]:
df_nas['State'].value_counts().head(5)

Maharashtra       8785
Kerala            8742
Tamil Nadu        8218
Gujarat           7889
Andhra Pradesh    7763
Name: State, dtype: int64

In [7]:
df_nas['State'].value_counts().tail(5)

Sikkim                  3577
Andaman & Nicobar       2510
Dadra & Nagar Haveli    2390
Nagaland                1919
Daman & Diu              483
Name: State, dtype: int64

##### 2.2 Top 5 states with highest Total % and lowest Total %

In [8]:
# Mean score per subject for every state.
# The subject columns are selected with a list: indexing a GroupBy with a bare
# tuple of names is deprecated and raises in pandas >= 2.0.
df_state = df_nas.groupby('State')[['Maths %', 'Science %', 'Social %', 'Reading %']].mean()
# 'Total %' is the unweighted mean of the four per-subject state averages.
df_state['Total %'] = df_state[['Maths %', 'Reading %', 'Science %', 'Social %']].mean(axis=1)

In [9]:
# Top 5 State - by  sort 
df_state.sort_values('Total %',ascending=False).head(5)

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Daman & Diu,39.070363,48.567978,48.445183,58.417366,48.625223
Uttar Pradesh,46.249564,40.087269,47.424712,46.053547,44.953773
Dadra & Nagar Haveli,39.190759,45.36235,43.819593,46.789067,43.790442
Tripura,39.786913,43.287356,43.2044,43.883633,42.540576
Kerala,28.968597,38.932014,42.944913,58.496412,42.335484


In [10]:
# Bottom 5 State - by  sort 
df_state.sort_values('Total %',ascending=True).head(5)

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pondicherry,26.010226,30.576886,30.38715,40.093955,31.767054
Meghalaya,27.458645,32.435003,32.502283,39.642971,33.009725
Tamil Nadu,27.449596,31.780568,31.405126,43.151505,33.446698
Andhra Pradesh,28.661412,32.675898,33.765278,43.151303,34.563473
Himachal Pradesh,28.12095,32.825157,35.014599,44.994749,35.238864


#### 3 Check the difference in marks By Gender

In [11]:
# Mean score per subject for each gender.
# The subject columns are selected with a list: indexing a GroupBy with a bare
# tuple of names is deprecated and raises in pandas >= 2.0.
df_gen = df_nas.groupby('Gender')[['Maths %', 'Science %', 'Social %', 'Reading %']].mean()
# 'Total %' is the unweighted mean of the four per-subject averages.
df_gen['Total %'] = df_gen[['Maths %', 'Reading %', 'Science %', 'Social %']].mean(axis=1)
df_gen

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boy,32.169603,36.369447,38.094746,45.225684,37.96487
Girl,32.330895,36.459287,38.616089,47.16241,38.64217


In [12]:
# Append a 'Marks Diff' row (Girl minus Boy) below the per-gender averages

# Work on a copy so that df_gen itself keeps only the two gender rows
df_gen2 = df_gen.copy()
df_gen2.loc['Marks Diff'] = df_gen2.loc['Girl'] - df_gen2.loc['Boy']
df_gen2

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boy,32.169603,36.369447,38.094746,45.225684,37.96487
Girl,32.330895,36.459287,38.616089,47.16241,38.64217
Marks Diff,0.161291,0.08984,0.521343,1.936726,0.6773


On average, girls score slightly higher than boys in every subject; the gap is largest in Reading (about 1.9 percentage points).

In [13]:
### Function to calculate the average marks by the factor and the difference between the factors

def nas_groupby_diff(df,groupby_col,diff1,diff2,markDiffFlag):
    """Average the subject marks for every level of ``groupby_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``groupby_col`` and the columns 'Maths %', 'Science %',
        'Social %' and 'Reading %'.
    groupby_col : str
        Column whose distinct values become the rows of the result.
    diff1, diff2 : str
        Row labels used for the optional difference row (``diff1 - diff2``).
        They are not used unless the difference row is requested.
    markDiffFlag : str or bool
        'Yes' (or ``True``) appends a 'Marks Diff' row; any other value
        returns only the averages.

    Returns
    -------
    pandas.DataFrame
        One row per level of ``groupby_col`` with the mean of each subject and
        a 'Total %' column (the mean of the four subject means). When the
        difference is requested the index is converted to strings and a final
        'Marks Diff' row is added.

    Raises
    ------
    KeyError
        If the difference is requested and ``diff1`` or ``diff2`` is not one of
        the (stringified) row labels.
    """
    # Select the columns with a list: indexing a GroupBy with a bare tuple of
    # names is deprecated and raises in pandas >= 2.0.
    df_t = df.groupby(groupby_col)[['Maths %', 'Science %', 'Social %', 'Reading %']].mean()
    df_t['Total %'] = df_t[['Maths %', 'Reading %', 'Science %', 'Social %']].mean(axis=1)

    if markDiffFlag is True or markDiffFlag == 'Yes':
        # Convert the labels to str type - from ordered cat - so that a new
        # label ('Marks Diff') can be added to the index
        df_t.index = df_t.index.astype(str)

        # Add the difference as an extra row; the columns align automatically
        df_t.loc['Marks Diff'] = df_t.loc[diff1] - df_t.loc[diff2]

    return df_t

#### 4 Get the difference in marks By Parents Education

In [14]:
df_nas['Father edu'].value_counts(dropna=False)

Primary           58588
Illiterate        45881
Secondary         44034
Not applicable    16784
Sr secondary      12856
Degree & above     6866
Name: Father edu, dtype: int64

In [15]:
# Drop students whose father's or mother's education is 'Not applicable'.
# The explicit .copy() gives df_nas its own data, so the column assignments in
# the following cells do not trigger SettingWithCopyWarning.
known_parent_edu = (df_nas['Father edu'] != 'Not applicable') & (df_nas['Mother edu'] != 'Not applicable')
df_nas = df_nas[known_parent_edu].copy()

In [16]:
# Convert the parents' education columns to an ordered category, so that
# grouped results are listed from lowest to highest education level
ordered_level = ['Illiterate', 'Primary', 'Secondary', 'Sr secondary','Degree & above']
ord_cat_type = CategoricalDtype(categories=ordered_level, ordered=True)
df_nas['Father edu'] = df_nas['Father edu'].astype(ord_cat_type)
df_nas['Mother edu'] = df_nas['Mother edu'].astype(ord_cat_type)

In [17]:
df_t = nas_groupby_diff(df_nas,'Father edu','Degree & above','Illiterate','No')
df_t.style.highlight_min(axis=0,color='hotpink').highlight_max(axis=0,color='greenyellow')

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
Father edu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Illiterate,31.4088,34.3795,36.8,41.1787,35.9417
Primary,32.3516,36.3996,38.2791,46.1875,38.3045
Secondary,32.3771,37.3067,39.3235,49.141,39.537
Sr secondary,33.777,38.7012,40.9368,53.3786,41.6984
Degree & above,37.8377,44.2371,44.6042,59.8803,46.6398


In [18]:
nas_groupby_diff(df_nas,'Father edu','Degree & above','Illiterate','Yes')

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
Father edu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Illiterate,31.408764,34.379484,36.799957,41.178664,35.941717
Primary,32.351572,36.39963,38.279091,46.187523,38.304454
Secondary,32.377078,37.306677,39.32348,49.140951,39.537047
Sr secondary,33.777039,38.701228,40.936847,53.378605,41.69843
Degree & above,37.837705,44.237098,44.604247,59.880302,46.639838
Marks Diff,6.428942,9.857614,7.80429,18.701638,10.698121


In [19]:
nas_groupby_diff(df_nas,'Mother edu','Degree & above','Illiterate','Yes')

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
Mother edu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Illiterate,32.249772,35.31711,37.660295,42.760053,36.996808
Primary,32.357862,36.331347,38.295381,46.552162,38.384188
Secondary,32.120475,37.607013,39.560079,50.947038,40.058651
Sr secondary,34.336969,40.524849,41.926885,56.420657,43.30234
Degree & above,36.590882,45.799305,45.478484,61.430497,47.324792
Marks Diff,4.341111,10.482195,7.818189,18.670444,10.327985


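Students whose parent has a 'Degree & above' score about 18 percentage points higher in Reading, and about 10 points higher in Total %, than students whose parent is 'Illiterate'. The gap is similar for both the mother's and the father's education level.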

##### 4.1 Get the number of students by Education level and Gender

In [20]:
df_nas.pivot_table(index='Gender', columns='Father edu', values='STUID',aggfunc='count')

Father edu,Illiterate,Primary,Secondary,Sr secondary,Degree & above
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boy,20319,25554,18584,5396,2796
Girl,18975,27644,21592,6591,3528


In [21]:
# NOTE(review): this cell is identical to the previous one (both count students by
# 'Father edu'); 'Mother edu' was presumably intended here - confirm and re-run.
df_nas.pivot_table(index='Gender', columns='Father edu', values='STUID',aggfunc='count')

Father edu,Illiterate,Primary,Secondary,Sr secondary,Degree & above
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boy,20319,25554,18584,5396,2796
Girl,18975,27644,21592,6591,3528


#### 5 How does the factor 'Play Games' affects the marks

In [22]:
df_nas['Play games'].value_counts(dropna=False)

Every day       94715
Once a week     30250
Never           12057
Once a month    12007
NaN              2115
Name: Play games, dtype: int64

In [23]:
ordered_level = ['Never', 'Once a month', 'Once a week', 'Every day']
ord_cat_type = CategoricalDtype(categories=ordered_level, ordered=True)
df_nas['Play games'] = df_nas['Play games'].astype(ord_cat_type)

df_t = nas_groupby_diff(df_nas,'Play games','Never','Every day','No')
df_t.style.highlight_min(axis=0,color='violet').highlight_max(axis=0,color='turquoise')

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
Play games,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Never,29.9518,33.0958,36.3266,43.7339,35.777
Once a month,31.4908,35.8206,38.6801,48.5623,38.6385
Once a week,32.2932,36.9829,38.7255,47.9523,38.9885
Every day,33.036,37.1543,39.0377,46.8829,39.0277


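Students who play games every day have the highest average in every subject except Reading (where 'Once a month' is highest), and students who never play have the lowest average in every subject. This is an association; it does not by itself show that playing raises the marks.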

In [24]:
# Get the difference
nas_groupby_diff(df_nas,'Play games','Every day','Never','Yes')

Unnamed: 0_level_0,Maths %,Science %,Social %,Reading %,Total %
Play games,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Never,29.951786,33.095756,36.326595,43.733896,35.777008
Once a month,31.490789,35.820615,38.680128,48.562293,38.638456
Once a week,32.293202,36.982858,38.72555,47.952296,38.988476
Every day,33.035967,37.154341,39.037714,46.882871,39.027723
Marks Diff,3.084181,4.058586,2.711119,3.148975,3.250715


#### 6 Get the effect of all factors with  difference in the minimum and maximum marks per factor

In [25]:
# Read the data
df_nas_factor = pd.read_csv("../nas_byFactor_2014.csv")

In [26]:
cm = sns.light_palette("blue", as_cmap=True)

df_nas_factor.style.background_gradient(cmap=cm)

Unnamed: 0,Factor,Total %,Maths %,Science %,Social %,Reading %
0,Gender,0.6773,0.161291,0.0898397,0.521343,1.93673
1,Age,4.18772,3.13436,3.34891,2.92866,8.0264
2,Siblings,1.4899,1.31767,1.68746,0.830884,5.23315
3,Handicap,3.89546,2.49019,3.34482,3.12095,7.12065
4,Father edu,10.8049,7.14773,9.87862,7.85857,18.8429
5,Mother edu,10.4208,5.00375,10.3277,8.44497,18.3774
6,Father occupation,10.4592,8.72121,10.3449,8.47401,17.4002
7,Mother occupation,8.66523,5.74344,7.28675,7.9215,14.8536
8,Below poverty,3.29351,2.7978,2.55696,3.34513,5.75762
9,Use calculator,2.68181,0.921162,2.71308,2.05553,5.03748


In [27]:
# Make Factor the index column.
# Guarded and written without inplace=True so the cell can be re-run safely:
# once 'Factor' is the index it is no longer a column, and a second
# set_index('Factor') would raise KeyError.
if 'Factor' in df_nas_factor.columns:
    df_nas_factor = df_nas_factor.set_index('Factor')

In [28]:
### Get the factors having the largest effect on the Total %
df_nas_factor.sort_values('Total %',ascending=False).head(5)

Unnamed: 0_level_0,Total %,Maths %,Science %,Social %,Reading %
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Father edu,10.804898,7.147734,9.878618,7.858566,18.842922
Father occupation,10.459215,8.721214,10.344859,8.474012,17.400177
Mother edu,10.420815,5.00375,10.327734,8.444972,18.377394
Mother occupation,8.665225,5.743442,7.286746,7.921499,14.853603
Help in household,5.154852,4.64136,4.984875,6.489467,5.674657


In [29]:
### Get the factors having the least effect on the Total %
df_nas_factor.sort_values('Total %',ascending=True).head(5)

Unnamed: 0_level_0,Total %,Maths %,Science %,Social %,Reading %
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gender,0.6773,0.161291,0.08984,0.521343,1.936726
Distance,0.767315,2.171856,0.894036,0.878486,1.238056
Private tuition,1.229088,1.268174,1.16842,0.330589,2.14917
Siblings,1.4899,1.317671,1.687458,0.830884,5.233149
Use Internet,1.610506,0.138483,2.064682,0.816788,3.699035


In [30]:
### Get the factors having the largest effect on the Maths %
df_nas_factor.sort_values('Maths %',ascending=False).head(5)

Unnamed: 0_level_0,Total %,Maths %,Science %,Social %,Reading %
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Father occupation,10.459215,8.721214,10.344859,8.474012,17.400177
Father edu,10.804898,7.147734,9.878618,7.858566,18.842922
Computer use,3.338935,6.50102,3.174619,4.056019,5.861707
Mother occupation,8.665225,5.743442,7.286746,7.921499,14.853603
Mother edu,10.420815,5.00375,10.327734,8.444972,18.377394


In [31]:
### Get the factors having the largest effect on the Science %
df_nas_factor.sort_values('Science %',ascending=False).head(5)

Unnamed: 0_level_0,Total %,Maths %,Science %,Social %,Reading %
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Father occupation,10.459215,8.721214,10.344859,8.474012,17.400177
Mother edu,10.420815,5.00375,10.327734,8.444972,18.377394
Father edu,10.804898,7.147734,9.878618,7.858566,18.842922
Mother occupation,8.665225,5.743442,7.286746,7.921499,14.853603
Help in household,5.154852,4.64136,4.984875,6.489467,5.674657


In [32]:
### Get the factors having the largest effect on the Social %
df_nas_factor.nlargest(5,'Social %').index.values

array(['Father occupation', 'Mother edu', 'Mother occupation',
       'Father edu', 'Help in household'], dtype=object)

#### 7 Check the effect of Parents Education and Gender on marks

In [33]:
### Function to calculate the groupby marks average
def nas_2_groupby_marks(df,groupby_col1,groupby_col2):
    """Average the subject marks for every combination of two factors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain both grouping columns and the columns 'Maths %',
        'Science %', 'Social %' and 'Reading %'.
    groupby_col1, groupby_col2 : str
        Columns to group by; they form the two levels of the result's index,
        in this order.

    Returns
    -------
    pandas.DataFrame
        Mean of each subject per (groupby_col1, groupby_col2) pair, plus a
        'Total %' column holding the mean of the four subject means.
    """
    # Select the columns with a list: indexing a GroupBy with a bare tuple of
    # names is deprecated and raises in pandas >= 2.0.
    df_t = df.groupby([groupby_col1,groupby_col2])[['Maths %', 'Science %', 'Social %', 'Reading %']].mean()
    df_t['Total %'] = df_t[['Maths %', 'Reading %', 'Science %', 'Social %']].mean(axis=1)

    return df_t

In [34]:
df_t = nas_2_groupby_marks(df_nas,'Mother edu','Gender')
df_t.style.highlight_min(axis=0,color='lightcoral').highlight_max(axis=0,color='lime')

Unnamed: 0_level_0,Unnamed: 1_level_0,Maths %,Science %,Social %,Reading %,Total %
Mother edu,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Illiterate,Boy,32.1767,35.5569,37.82,42.7291,37.0707
Illiterate,Girl,32.2935,35.0565,37.5048,42.8119,36.9167
Primary,Boy,32.3862,36.3695,37.9705,45.5848,38.0778
Primary,Girl,32.319,36.2983,38.5884,47.4369,38.6606
Secondary,Boy,32.0964,37.2656,38.8374,49.1771,39.3441
Secondary,Girl,32.1447,37.8954,40.1786,52.4522,40.6677
Sr secondary,Boy,33.686,40.0768,41.0214,53.4701,42.0636
Sr secondary,Girl,34.8745,40.8956,42.6533,58.7651,44.2971
Degree & above,Boy,35.9543,44.8605,44.0791,57.9986,45.7231
Degree & above,Girl,37.1414,46.5797,46.6055,64.2098,48.6341


In [35]:
df_t = nas_2_groupby_marks(df_nas,'Father edu','Gender')
df_t.style.highlight_min(axis=0,color='violet').highlight_max(axis=0,color='springgreen')

Unnamed: 0_level_0,Unnamed: 1_level_0,Maths %,Science %,Social %,Reading %,Total %
Father edu,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Illiterate,Boy,31.2846,34.461,36.8002,40.9506,35.8741
Illiterate,Girl,31.5384,34.2967,36.8098,41.4503,36.0238
Primary,Boy,32.3357,36.6408,38.2085,45.5242,38.1773
Primary,Girl,32.3237,36.1598,38.3496,46.794,38.4068
Secondary,Boy,32.4362,36.9912,38.8304,47.6968,38.9886
Secondary,Girl,32.3251,37.5767,39.7513,50.4086,40.0154
Sr secondary,Boy,33.6962,38.772,40.3974,51.5833,41.1122
Sr secondary,Girl,33.8566,38.642,41.3736,54.8576,42.1825
Degree & above,Boy,37.4782,44.0739,43.7335,58.2042,45.8725
Degree & above,Girl,38.1698,44.3645,45.286,61.2811,47.2754


#### 8 What is the difference in  marks by the factors Play Games and Read other books

In [36]:
df_t = nas_2_groupby_marks(df_nas,'Play games','Read other books')
df_t.style.highlight_min(axis=0,color='pink').highlight_max(axis=0,color='greenyellow')

Unnamed: 0_level_0,Unnamed: 1_level_0,Maths %,Science %,Social %,Reading %,Total %
Play games,Read other books,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Never,No,30.2674,32.1738,35.4719,42.2736,35.0467
Never,Yes,30.3159,33.9725,37.2481,45.4952,36.7579
Once a month,No,30.4018,33.588,37.2038,43.3051,36.1247
Once a month,Yes,32.0878,36.9058,39.5344,50.9101,39.8595
Once a week,No,31.3168,34.9654,36.5619,43.2745,36.5296
Once a week,Yes,32.8827,37.8553,39.5296,49.9049,40.0431
Every day,No,32.8966,35.5168,37.2836,41.9829,36.92
Every day,Yes,33.368,38.0766,39.9751,49.0802,40.125


#### 9 Which Subject do students score most/least

In [37]:
df_nas_marks = df_nas.filter(like='%').copy()

In [38]:
# Work on the subject-mark columns only ('... %'), so the label columns added
# below never take part in idxmax/idxmin and the cell can be re-run safely.
marks_only = df_nas_marks.filter(like='%')
# Get the max marks columns (on ties the first column in order is reported)
df_nas_marks['highest_marks'] = marks_only.idxmax(axis='columns')
# Get the min marks columns (on ties the first column in order is reported)
df_nas_marks['lowest_marks'] = marks_only.idxmin(axis='columns')

In [39]:
df_nas_marks['highest_marks'].value_counts(normalize=True)

Reading %    0.369876
Social %     0.263707
Science %    0.221958
Maths %      0.144458
Name: highest_marks, dtype: float64

In [40]:
df_nas_marks['lowest_marks'].value_counts(normalize=True)

Maths %      0.369198
Science %    0.270389
Social %     0.213560
Reading %    0.146853
Name: lowest_marks, dtype: float64

##### 9.1 For students with highest score in Maths what is the 2nd highest scoring subject

In [41]:
pd.crosstab(df_nas_marks['highest_marks'],df_nas_marks['lowest_marks'],normalize=1)

lowest_marks,Maths %,Reading %,Science %,Social %
highest_marks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Maths %,0.058561,0.648681,0.066312,0.045172
Reading %,0.770093,0.126807,0.111624,0.172109
Science %,0.096229,0.093916,0.076774,0.711182
Social %,0.075117,0.130595,0.745289,0.071537


In [42]:
#If highest in Maths then  what is the 2nd highest 
(df_nas_marks[df_nas_marks['highest_marks'] == 'Maths %']
 .drop(['Maths %','highest_marks','lowest_marks'], axis='columns')
 .idxmax(axis='columns')
 .value_counts(normalize=True)
)

Reading %    0.775662
Science %    0.145012
Social %     0.079326
dtype: float64

In [43]:
#If highest in Reading then  what is the 2nd highest 
(df_nas_marks[df_nas_marks['highest_marks'] == 'Reading %']
 .drop(['Reading %','highest_marks','lowest_marks'], axis='columns')
 .idxmax(axis='columns')
 .value_counts(normalize=True)
)

Maths %      0.770806
Social %     0.131546
Science %    0.097648
dtype: float64

In [44]:
#If highest in Science then  what is the 2nd highest 
(df_nas_marks[df_nas_marks['highest_marks'] == 'Science %']
 .drop(['Science %','highest_marks','lowest_marks'], axis='columns')
 .idxmax(axis='columns')
 .value_counts(normalize=True)
)

Social %     0.762414
Maths %      0.164072
Reading %    0.073514
dtype: float64

In [45]:
#If highest in Social then  what is the 2nd highest 
(df_nas_marks[df_nas_marks['highest_marks'] == 'Social %']
 .drop(['Social %','highest_marks','lowest_marks'], axis='columns')
 .idxmax(axis='columns')
 .value_counts(normalize=True)
)

Science %    0.815347
Maths %      0.103823
Reading %    0.080830
dtype: float64

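For most students whose best subject is Science the second-best is Social (and vice versa), and for most whose best subject is Maths the second-best is Reading (and vice versa). This suggests some affinity for marks between Social and Science and between Maths and Reading.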

##### 9.2 Check the correlation 

In [46]:
df_nas.filter(like='%').corr()

Unnamed: 0,Maths %,Reading %,Science %,Social %
Maths %,1.0,0.349076,0.504585,0.521961
Reading %,0.349076,1.0,0.432412,0.463202
Science %,0.504585,0.432412,1.0,0.560325
Social %,0.521961,0.463202,0.560325,1.0


In [47]:
#If lowest is Maths, what is the highest 
(df_nas_marks[df_nas_marks['lowest_marks'] == 'Maths %']
 .drop(['Maths %','highest_marks','lowest_marks'], axis='columns')
 .idxmax(axis='columns')
 .value_counts(normalize=True)
)

Reading %    0.819173
Science %    0.101615
Social %     0.079212
dtype: float64

In [48]:
#If lowest is Maths, what is next lowest
(df_nas_marks[df_nas_marks['lowest_marks'] == 'Maths %']
 .drop(['Maths %','highest_marks','lowest_marks'], axis='columns')
 .idxmin(axis='columns')
 .value_counts(normalize=True)
)

Reading %    0.77838
Science %    0.12535
Social %     0.09627
dtype: float64

#### 10 Interactive: calculate for all combinations of factors

In [49]:
import ipywidgets as widgets
from ipywidgets import interact_manual

##### 10.1 Interactive , Single Factor

In [50]:
@interact_manual
def nas_grpby_interactive(col=list(df_nas.drop('State',axis=1).select_dtypes(include=[object, 'category']).columns.values)):
    """Print the average marks per level of the factor chosen in the dropdown.

    The dropdown offers every text or categorical column of ``df_nas`` except
    'State'. Categorical columns are included explicitly because the factors
    converted to an ordered category earlier in the notebook would otherwise
    be missing from the list.
    """
    # Remove nan
    df = df_nas.dropna(subset = [col])
    # No difference row is requested ('No'), so the two level names are placeholders
    df_t = nas_groupby_diff(df,col,'ignore','ignore','No')
    print(df_t)

interactive(children=(Dropdown(description='col', options=('Use computer', 'Gender', 'Age', 'Siblings', 'Handi…

##### 10.2 Interactive , by two factors

In [51]:
# Factors offered in both dropdowns: every text or categorical column except
# 'State'. Categorical columns are included explicitly because the factors
# converted to an ordered category earlier in the notebook would otherwise be
# missing from the list.
_factor_cols = list(df_nas.drop('State',axis=1).select_dtypes(include=[object, 'category']).columns.values)

# NOTE(review): this reuses the name of the single-factor function defined in
# the previous cell and therefore shadows it; the name is kept to avoid
# changing the notebook's namespace.
@interact_manual
def nas_grpby_interactive(col1=_factor_cols,
                          col2=_factor_cols[1:]) :
    """Print the average marks for every combination of the two chosen factors.

    ``col2`` starts one entry further down the list so that the two dropdowns
    do not default to the same factor.
    """
    # Grouping by the same factor twice adds no information; ask for a second one
    if col1 == col2:
        print("Please select two different factors.")
        return

    # Remove nan
    df = df_nas.dropna(subset = [col1, col2])
    df_t = nas_2_groupby_marks(df,col1,col2)
    print(df_t)

interactive(children=(Dropdown(description='col1', options=('Use computer', 'Gender', 'Age', 'Siblings', 'Hand…