In [1026]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf

In [1027]:
# Load the raw train / test tables.
# Raw string literals keep the Windows backslashes literal; in a normal
# string "\E", "\V", "\s" and "\T" are invalid escape sequences that Python
# warns about. The resulting paths are unchanged.
train_df = pd.read_csv(r'D:\Events\VIL Codefest\secret\VIL Confidential Information Dataset\Train.csv')
test_df = pd.read_csv(r'D:\Events\VIL Codefest\secret\VIL Confidential Information Dataset\Test.csv')

In [1028]:
# Preview the first five rows of the freshly loaded training table.
train_df.head(5)

Unnamed: 0,Age,ARPU,Age on Network,Bill Payment Aggregator,Brand Identifier,Circle Name,Connection Type,Data Usage,Gender,Genre,ID,International Usage,OTT Content App,Pincode,Recharge,SMS Usage,VAS Subscription,Voice Usage,Web/App,DND
0,24-28,0-50,24,,Vodafone,Kolkata,Prepaid,10-15,M,Game Series,VFMUKOL38741,,Yes,700040.0,501-1000,51-100,HOICHOI PARTNER PACK,501-750,Non_User,N
1,39-43,0-50,<3,,Vodafone,Kolkata,Prepaid,<1,M,Marathi Drama,VFMUKOL54162,,Yes,700038.0,101-300,1-25,,751-1000,Non_User,N
2,29-33,101-200,>24,,Idea,Delhi,Prepaid,10-15,M,Marathi Drama,IDKRDL59718,,Yes,110053.0,101-200,1-25,,>1000,Non_User,Y
3,>58,>750,18-24,,Vodafone,Mumbai,Prepaid,>15,F,Game Series,VFMUKOL5295,Yes,Yes,410218.0,201-300,1-25,,501-750,Non_User,Y
4,>58,301-500,3-6,301-500,Vodafone,Kolkata,Postpaid,10-15,O,Marathi Drama,VFMUKOL60365,,Yes,700091.0,,26-50,HOICHOI PARTNER PACK,501-750,Non_User,N


# Dropping Unused Columns

In [1029]:
# Remove the columns that are not carried into the cleaned training table.
train_df.drop(
    columns=['Bill Payment Aggregator', 'OTT Content App', 'Pincode', 'ID'],
    inplace=True,
)

In [1030]:
# Remove the same four columns from the test table so both frames keep
# an identical set of columns.
test_df.drop(
    columns=['Bill Payment Aggregator', 'OTT Content App', 'Pincode', 'ID'],
    inplace=True,
)

In [1031]:
# Preview the first five rows of the test table after the column drop.
test_df.head(5)

Unnamed: 0,Age,ARPU,Age on Network,Brand Identifier,Circle Name,Connection Type,Data Usage,Gender,Genre,International Usage,Recharge,SMS Usage,VAS Subscription,Voice Usage,Web/App,DND
0,>58,0-50,24,Vodafone,Kolkata,Prepaid,<1,M,Marathi Drama,,1-50,1-25,,0-50,Non_User,N
1,39-43,101-200,18-24,Vodafone,Kolkata,Prepaid,10-15,F,Marathi Drama,,101-300,1-25,MOVIES AND TV TRAIL PACK GOT ACTIVATED WITHOUT...,101-200,Non_User,N
2,39-43,501-750,>24,Idea,Kerala,Postpaid,7.5-10,M,Game Series,,,1-25,,>1000,APP,N
3,39-43,>750,6-9,Idea,Kerala,Postpaid,3-5,M,Game Series,,,0,,101-200,APP,N
4,>58,>750,>24,Idea,Kerala,Postpaid,10-15,M,Game Series,,,1-25,,>1000,Non_User,N


# Missing Values

In [1032]:
# Number of missing values in each training column.
train_df.isna().sum()

Age                      2824
ARPU                        0
Age on Network           1361
Brand Identifier            0
Circle Name                 0
Connection Type             0
Data Usage                  0
Gender                   3288
Genre                       0
International Usage    107735
Recharge                27715
SMS Usage                   0
VAS Subscription        80286
Voice Usage                 0
Web/App                     0
DND                         0
dtype: int64

In [1033]:
# Number of missing values in each test column.
test_df.isna().sum()

Age                     1175
ARPU                       0
Age on Network           585
Brand Identifier           0
Circle Name                0
Connection Type            0
Data Usage                 0
Gender                  1421
Genre                      0
International Usage    46220
Recharge               11743
SMS Usage                  0
VAS Subscription       34404
Voice Usage                0
Web/App                    0
DND                        0
dtype: int64

### International Usage & VAS Subscription

In [1034]:
# Treat a missing 'International Usage' entry as 'No'.
# The filled column is assigned back rather than filled in place on the
# column selection: under pandas Copy-on-Write that selection is a temporary
# object, so an in-place fill on it would never reach the frame.
train_df['International Usage'] = train_df['International Usage'].fillna('No')
test_df['International Usage'] = test_df['International Usage'].fillna('No')

In [1035]:
# Treat a missing 'VAS Subscription' entry as the literal category 'None'.
# Assigned back instead of filled in place on the column selection, because
# under pandas Copy-on-Write an in-place call on that temporary selection
# would not update the frame.
train_df['VAS Subscription'] = train_df['VAS Subscription'].fillna('None')
test_df['VAS Subscription'] = test_df['VAS Subscription'].fillna('None')

# Drop Rows with Missing Values

In [1036]:
# Keep only the fully populated rows of each frame and stack them to see how
# many complete records exist across train and test.
nan_train_df = train_df.dropna()
nan_test_df = test_df.dropna()
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# row-wise stack with a fresh 0..n-1 index.
nan_df = pd.concat([nan_train_df, nan_test_df], ignore_index=True)
nan_df.shape

(118282, 16)

# Label Encoding

In [1037]:
# Give every remaining gap in the training table the placeholder label 'NaN'
# so that missing values become an ordinary category for the label encoder.
train_df.fillna(value='NaN', inplace=True)

In [1038]:
# Single encoder instance that is refitted on each column in turn below; the
# per-column label -> code dicts preserve what each fit learned.
le = LabelEncoder()

### 01 - Age

In [1039]:
# Replace the 'Age' labels of the training table with integer codes; the
# encoder is fitted on the training column only.
train_df['Age'] = le.fit_transform(train_df['Age'])

In [1040]:
# Lookup from each original 'Age' label to the integer code it received.
le_age_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1041]:
# Re-code the 'NaN' placeholder category as the sentinel -1.
# Assigned back rather than replaced in place on the column selection, which
# would not update train_df under pandas Copy-on-Write.
train_df['Age'] = train_df['Age'].replace(le_age_mapping['NaN'], -1)

In [1042]:
# Encode the test 'Age' labels with the codes learned from the training
# column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Age'] = test_df['Age'].replace(le_age_mapping)

### 02 - ARPU

In [1043]:
# Integer-encode 'ARPU' (encoder fitted on the training column only) ...
train_df['ARPU'] = le.fit_transform(train_df['ARPU'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_arpu_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1044]:
# Inspect the label -> code table. Codes follow the sorted string order of
# the labels, not the numeric order of the ranges: '51-100' is coded 6,
# after '501-750' (5).
le_arpu_mapping

{'0': 0,
 '0-50': 1,
 '101-200': 2,
 '201-300': 3,
 '301-500': 4,
 '501-750': 5,
 '51-100': 6,
 '>750': 7}

In [1045]:
# Encode the test 'ARPU' labels with the codes learned from the training
# column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['ARPU'] = test_df['ARPU'].replace(le_arpu_mapping)

### 03 - Age on Network

In [1046]:
# Integer-encode 'Age on Network' (encoder fitted on the training column only) ...
train_df['Age on Network'] = le.fit_transform(train_df['Age on Network'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_aon_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1047]:
# Re-code the 'NaN' placeholder category as the sentinel -1.
# Assigned back rather than replaced in place on the column selection, which
# would not update train_df under pandas Copy-on-Write.
train_df['Age on Network'] = train_df['Age on Network'].replace(le_aon_mapping['NaN'], -1)

In [1048]:
# Encode the test 'Age on Network' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Age on Network'] = test_df['Age on Network'].replace(le_aon_mapping)

### 04 - Brand Identifier

In [1049]:
# Integer-encode 'Brand Identifier' (encoder fitted on the training column only) ...
train_df['Brand Identifier'] = le.fit_transform(train_df['Brand Identifier'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_brand_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1050]:
# Encode the test 'Brand Identifier' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Brand Identifier'] = test_df['Brand Identifier'].replace(le_brand_mapping)

### 05 - Circle Name

In [1051]:
# Integer-encode 'Circle Name' (encoder fitted on the training column only) ...
train_df['Circle Name'] = le.fit_transform(train_df['Circle Name'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_circle_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1052]:
# Encode the test 'Circle Name' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Circle Name'] = test_df['Circle Name'].replace(le_circle_mapping)

### 06 - Connection Type

In [1053]:
# Integer-encode 'Connection Type' (encoder fitted on the training column only) ...
train_df['Connection Type'] = le.fit_transform(train_df['Connection Type'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_connection_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1054]:
# Encode the test 'Connection Type' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Connection Type'] = test_df['Connection Type'].replace(le_connection_mapping)

### 07 - Data Usage

In [1055]:
# Integer-encode 'Data Usage' (encoder fitted on the training column only) ...
train_df['Data Usage'] = le.fit_transform(train_df['Data Usage'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_data_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1056]:
# Encode the test 'Data Usage' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Data Usage'] = test_df['Data Usage'].replace(le_data_mapping)

### 08 - Gender

In [1057]:
# Integer-encode 'Gender' (encoder fitted on the training column only) ...
train_df['Gender'] = le.fit_transform(train_df['Gender'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_gender_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1058]:
# Re-code the 'NaN' placeholder category as the sentinel -1.
# Assigned back rather than replaced in place on the column selection, which
# would not update train_df under pandas Copy-on-Write.
train_df['Gender'] = train_df['Gender'].replace(le_gender_mapping['NaN'], -1)

In [1059]:
# Encode the test 'Gender' labels with the codes learned from the training
# column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Gender'] = test_df['Gender'].replace(le_gender_mapping)

### 09 - Genre

In [1060]:
# Integer-encode 'Genre' (encoder fitted on the training column only) ...
train_df['Genre'] = le.fit_transform(train_df['Genre'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_genre_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1061]:
# Encode the test 'Genre' labels with the codes learned from the training
# column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Genre'] = test_df['Genre'].replace(le_genre_mapping)

### 10 - International Usage

In [1062]:
# Integer-encode 'International Usage' (encoder fitted on the training column only) ...
train_df['International Usage'] = le.fit_transform(train_df['International Usage'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_international_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1063]:
# Encode the test 'International Usage' labels with the codes learned from
# the training column. Assigned back rather than replaced in place on the
# column selection, which would not update test_df under pandas Copy-on-Write.
test_df['International Usage'] = test_df['International Usage'].replace(le_international_mapping)

### 11 - Recharge

In [1064]:
# Integer-encode 'Recharge' (encoder fitted on the training column only) ...
train_df['Recharge'] = le.fit_transform(train_df['Recharge'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_recharge_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1065]:
# Re-code the 'NaN' placeholder category as the sentinel -1.
# Assigned back rather than replaced in place on the column selection, which
# would not update train_df under pandas Copy-on-Write.
train_df['Recharge'] = train_df['Recharge'].replace(le_recharge_mapping['NaN'], -1)

In [1066]:
# Encode the test 'Recharge' labels with the codes learned from the training
# column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Recharge'] = test_df['Recharge'].replace(le_recharge_mapping)

### 12 - SMS Usage

In [1067]:
# Integer-encode 'SMS Usage' (encoder fitted on the training column only) ...
train_df['SMS Usage'] = le.fit_transform(train_df['SMS Usage'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_sms_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1068]:
# Encode the test 'SMS Usage' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['SMS Usage'] = test_df['SMS Usage'].replace(le_sms_mapping)

### 13 - VAS Subscription

In [1069]:
# Integer-encode 'VAS Subscription' (encoder fitted on the training column only) ...
train_df['VAS Subscription'] = le.fit_transform(train_df['VAS Subscription'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_vas_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1070]:
# Encode the test 'VAS Subscription' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
# NOTE(review): labels that never occur in the training column are not in
# the mapping and are left as their original strings - verify none remain.
test_df['VAS Subscription'] = test_df['VAS Subscription'].replace(le_vas_mapping)

### 14 - Voice Usage

In [1071]:
# Integer-encode 'Voice Usage' (encoder fitted on the training column only) ...
train_df['Voice Usage'] = le.fit_transform(train_df['Voice Usage'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_voice_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1072]:
# Encode the test 'Voice Usage' labels with the codes learned from the
# training column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Voice Usage'] = test_df['Voice Usage'].replace(le_voice_mapping)

### 15 - Web/App

In [1073]:
# Integer-encode 'Web/App' (encoder fitted on the training column only) ...
train_df['Web/App'] = le.fit_transform(train_df['Web/App'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_webapp_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1074]:
# Encode the test 'Web/App' labels with the codes learned from the training
# column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['Web/App'] = test_df['Web/App'].replace(le_webapp_mapping)

### 16 - DND

In [1075]:
# Integer-encode 'DND' (encoder fitted on the training column only) ...
train_df['DND'] = le.fit_transform(train_df['DND'])
# ... and keep the label -> code lookup so the test column can reuse it.
le_dnd_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [1076]:
# Encode the test 'DND' labels with the codes learned from the training
# column. Assigned back rather than replaced in place on the column
# selection, which would not update test_df under pandas Copy-on-Write.
test_df['DND'] = test_df['DND'].replace(le_dnd_mapping)

### Test Data

In [1090]:
# Turn every remaining missing value in the test table into -1, the same
# sentinel the training columns use for their 'NaN' category.
# Only train_df is visibly filled with the 'NaN' placeholder string, and the
# execution counts skip from 1076 to 1090 here, so on a fresh run test_df
# presumably still holds real NaN values. The string replace alone would not
# match those and the integer casts that follow would fail; fillna(-1)
# covers them. The string replace is kept for frames that do carry the
# placeholder.
test_df = test_df.replace('NaN', -1).fillna(-1)

In [1102]:
# Cast the four columns that originally contained missing values to an
# integer dtype, one column at a time.
for _column in ('Recharge', 'Age', 'Age on Network', 'Gender'):
    test_df[_column] = test_df[_column].astype('int')

In [1103]:
# Preview the first five rows of the fully encoded test table.
test_df.head(5)

Unnamed: 0,Age,ARPU,Age on Network,Brand Identifier,Circle Name,Connection Type,Data Usage,Gender,Genre,International Usage,Recharge,SMS Usage,VAS Subscription,Voice Usage,Web/App,DND
0,9,1,2,1,2,1,6,1,3,0,1,1,87,1,1,0
1,4,2,1,1,2,1,2,0,3,0,3,1,85,2,1,0
2,4,5,7,0,1,0,5,1,2,0,-1,1,87,8,0,0
3,4,7,4,0,1,0,3,1,2,0,-1,0,87,2,0,0
4,9,7,7,0,1,0,2,1,2,0,-1,1,87,8,1,0


# Save Cleaned Data to CSV

In [1104]:
# Persist the encoded training table next to the raw files.
# Raw string literal: the Windows backslashes would otherwise form invalid
# escape sequences ("\E", "\V", "\s", "\T"); the path itself is unchanged.
# The frame's index is written as the first, unnamed column (to_csv default).
train_df.to_csv(r'D:\Events\VIL Codefest\secret\VIL Confidential Information Dataset\Train_cleaned.csv')

In [1105]:
# Persist the encoded test table next to the raw files.
# Raw string literal: the Windows backslashes would otherwise form invalid
# escape sequences ("\E", "\V", "\s", "\T"); the path itself is unchanged.
# The frame's index is written as the first, unnamed column (to_csv default).
test_df.to_csv(r'D:\Events\VIL Codefest\secret\VIL Confidential Information Dataset\Test_cleaned.csv')