In [7]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import csv 

## Subtasks for Data Set 2 (Adult Data Set):


In [8]:
# Reading dataset and printing missing value information

df = pd.read_csv('./Adult Data Set/adult.data', index_col = False, header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'])
df.replace(' ?', np.nan, inplace=True)
missing_values_table(df)

# To handle null values (https://stats.stackexchange.com/questions/228622/handling-null-value-in-logistic-regression)


# One hot encoding using pandas get_dummies and then dropping 
# one category for each categorial feature to preserve linear dependency (https://datascience.stackexchange.com/questions/27957/why-do-we-need-to-discard-one-dummy-variable/27993#27993)

df['workclass'] = df['workclass'].astype('category')
df['education'] = df['education'].astype('category')
df['marital-status'] = df['marital-status'].astype('category')
df['occupation'] = df['occupation'].astype('category')
df['relationship'] = df['relationship'].astype('category')
df['race'] = df['race'].astype('category')
df['sex'] = df['sex'].astype('category')
df['native-country'] = df['native-country'].astype('category')
df = pd.get_dummies(df , drop_first= True)

df.head(50)

Your selected dataframe has 14 columns.
There are 3 columns that have missing values.


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,37,284582,14,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,49,160187,5,0,0,16,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,52,209642,9,0,0,45,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,31,45781,14,14084,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9,42,159449,13,5178,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
def missing_values_table(df):
   
    # Total missing values
    mis_val = df.isnull().sum()
    
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    
    # Sort the table by percentage of missing descending
    # .iloc[:, 1]!= 0: filter on missing missing values not equal to zero
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(2)  # round(2), keep 2 digits
    
    # Print some summary information
    print("Your selected dataframe has {} columns.".format(df.shape[1]) + '\n' + 
    "There are {} columns that have missing values.".format(mis_val_table_ren_columns.shape[0]))
    
    # Return the dataframe with missing information
    return mis_val_table_ren_columns

<h2> Subtasks for Data set 1 (Ionosphere)</h2>


In [20]:
#df is dataframe 
df = pd.read_csv('./IonosphereDataSet/ionosphere.data',header=None)

print(df)

df.describe()



     0   1        2        3        4        5        6        7        8   \
0     1   0  0.99539 -0.05889  0.85243  0.02306  0.83398 -0.37708  1.00000   
1     1   0  1.00000 -0.18829  0.93035 -0.36156 -0.10868 -0.93597  1.00000   
2     1   0  1.00000 -0.03365  1.00000  0.00485  1.00000 -0.12062  0.88965   
3     1   0  1.00000 -0.45161  1.00000  1.00000  0.71216 -1.00000  0.00000   
4     1   0  1.00000 -0.02401  0.94140  0.06531  0.92106 -0.23255  0.77152   
..   ..  ..      ...      ...      ...      ...      ...      ...      ...   
346   1   0  0.83508  0.08298  0.73739 -0.14706  0.84349 -0.05567  0.90441   
347   1   0  0.95113  0.00419  0.95183 -0.02723  0.93438 -0.01920  0.94590   
348   1   0  0.94701 -0.00034  0.93207 -0.03227  0.95177 -0.03431  0.95584   
349   1   0  0.90608 -0.01657  0.98122 -0.01989  0.95691 -0.03646  0.85746   
350   1   0  0.84710  0.13533  0.73638 -0.06151  0.87873  0.08260  0.88928   

          9   ...       25       26       27       28       29 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
count,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
mean,0.891738,0.0,0.641342,0.044372,0.601068,0.115889,0.550095,0.11936,0.511848,0.181345,...,0.396135,-0.071187,0.541641,-0.069538,0.378445,-0.027907,0.352514,-0.003794,0.349364,0.01448
std,0.311155,0.0,0.497708,0.441435,0.519862,0.46081,0.492654,0.52075,0.507066,0.483851,...,0.578451,0.508495,0.516205,0.550025,0.575886,0.507974,0.571483,0.513574,0.522663,0.468337
min,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,1.0,0.0,0.472135,-0.064735,0.41266,-0.024795,0.21131,-0.05484,0.08711,-0.048075,...,0.0,-0.33239,0.286435,-0.443165,0.0,-0.236885,0.0,-0.242595,0.0,-0.16535
50%,1.0,0.0,0.87111,0.01631,0.8092,0.0228,0.72873,0.01471,0.68421,0.01829,...,0.55389,-0.01505,0.70824,-0.01769,0.49664,0.0,0.44277,0.0,0.40956,0.0
75%,1.0,0.0,1.0,0.194185,1.0,0.334655,0.96924,0.445675,0.95324,0.534195,...,0.90524,0.156765,0.999945,0.153535,0.883465,0.154075,0.85762,0.20012,0.813765,0.17166
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
