# Data Preprocessing

In [6]:
import pandas as pd
import numpy as np 
import matplotlib as mlt

In [7]:
df = pd.read_csv("cell2celltrain.csv")

In [8]:
# identify columns with non-numeric data:

non_numerical = []

for column in df.columns:
    if df[column].dtype == 'object':
        non_numerical.append(column)
        
print("Columns having Non Numerical data are :")
print(non_numerical)
print(len(non_numerical))
    
    

Columns having Non Numerical data are :
['Churn', 'ServiceArea', 'ChildrenInHH', 'HandsetRefurbished', 'HandsetWebCapable', 'TruckOwner', 'RVOwner', 'Homeownership', 'BuysViaMailOrder', 'RespondsToMailOffers', 'OptOutMailings', 'NonUSTravel', 'OwnsComputer', 'HasCreditCard', 'NewCellphoneUser', 'NotNewCellphoneUser', 'OwnsMotorcycle', 'HandsetPrice', 'MadeCallToRetentionTeam', 'CreditRating', 'PrizmCode', 'Occupation', 'MaritalStatus']
23


In [9]:
# Describing a perticaular colloumn 

non_numeric_description = df['ServiceArea'].describe(include='object')
print("\nDescriptive statistics for the non-numeric column 'column_name':")
print(non_numeric_description)


Descriptive statistics for the non-numeric column 'column_name':
count         51023
unique          747
top       NYCBRO917
freq           1684
Name: ServiceArea, dtype: object


In [10]:
# Number of Unique element in a column 

num_of_unique = df['ServiceArea'].nunique()

print("Number of unique values",num_of_unique)

Number of unique values 747


In [11]:
# Only keeping the first three characters for each value in the ServiceArea columns 

df['ServiceArea'] = df["ServiceArea"].str[:3]
print(df['ServiceArea'])

0        SEA
1        PIT
2        MIL
3        PIT
4        OKC
        ... 
51042    LAX
51043    LAX
51044    LAX
51045    NEV
51046    NEV
Name: ServiceArea, Length: 51047, dtype: object


In [12]:
unique = df['ServiceArea'].unique()
print(unique)

['SEA' 'PIT' 'MIL' 'OKC' 'SAN' 'SLC' 'LOU' 'KCY' 'DEN' 'PHI' 'OMA' 'IND'
 'NSH' 'PHX' 'DAL' 'NYC' 'NOL' 'MIN' 'NNY' 'BOS' 'DET' 'STL' 'MIA' 'BIR'
 'SFR' 'HAR' 'APC' nan 'FLN' 'NEV' 'OHI' 'ATL' 'NCR' 'HOU' 'CHI' 'SFU'
 'HWI' 'LAX' 'NMX' 'SHE' 'SDA' 'AIR' 'LAU' 'NOR' 'ATH' 'SEW' 'OHH' 'INH'
 'NMC' 'IPM' 'NVU' 'AWI' 'VAH' 'HOP' 'INU' 'GCW' 'LAW' 'SLU']


In [13]:
missing_values = df['ServiceArea'].isna().sum()

print("The count of missing values",missing_values)

The count of missing values 24


In [14]:
#one hot encoding ServiceArea column 

one_hotEncoding_ServiceArea = pd.get_dummies(df['ServiceArea'],prefix='ServiceArea')

df_encoded = pd.concat([df,one_hotEncoding_ServiceArea],axis=1)

df_encoded.drop('ServiceArea',axis=1,inplace=True)

print(df_encoded.head())



   CustomerID Churn  MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0     3000002   Yes           24.00           219.0                  22.0   
1     3000010   Yes           16.99            10.0                  17.0   
2     3000014    No           38.00             8.0                  38.0   
3     3000022    No           82.28          1312.0                  75.0   
4     3000026   Yes           17.14             0.0                  17.0   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0                   0.25             0.0           0.0             -157.0   
1                   0.00             0.0           0.0               -4.0   
2                   0.00             0.0           0.0               -2.0   
3                   1.24             0.0           0.0              157.0   
4                   0.00             0.0           0.0                0.0   

   PercChangeRevenues  ...  ServiceArea_SDA  ServiceArea_SEA  ServiceArea_

In [15]:

boolen_columns = df_encoded.select_dtypes(include='bool').columns

print("Columns with true and false values")
for column in boolen_columns:
    print(column)

Columns with true and false values
ServiceArea_AIR
ServiceArea_APC
ServiceArea_ATH
ServiceArea_ATL
ServiceArea_AWI
ServiceArea_BIR
ServiceArea_BOS
ServiceArea_CHI
ServiceArea_DAL
ServiceArea_DEN
ServiceArea_DET
ServiceArea_FLN
ServiceArea_GCW
ServiceArea_HAR
ServiceArea_HOP
ServiceArea_HOU
ServiceArea_HWI
ServiceArea_IND
ServiceArea_INH
ServiceArea_INU
ServiceArea_IPM
ServiceArea_KCY
ServiceArea_LAU
ServiceArea_LAW
ServiceArea_LAX
ServiceArea_LOU
ServiceArea_MIA
ServiceArea_MIL
ServiceArea_MIN
ServiceArea_NCR
ServiceArea_NEV
ServiceArea_NMC
ServiceArea_NMX
ServiceArea_NNY
ServiceArea_NOL
ServiceArea_NOR
ServiceArea_NSH
ServiceArea_NVU
ServiceArea_NYC
ServiceArea_OHH
ServiceArea_OHI
ServiceArea_OKC
ServiceArea_OMA
ServiceArea_PHI
ServiceArea_PHX
ServiceArea_PIT
ServiceArea_SAN
ServiceArea_SDA
ServiceArea_SEA
ServiceArea_SEW
ServiceArea_SFR
ServiceArea_SFU
ServiceArea_SHE
ServiceArea_SLC
ServiceArea_SLU
ServiceArea_STL
ServiceArea_VAH


In [16]:
rows_with_na_values  = df[df.isna().any(axis=1)]

print(rows_with_na_values)


       CustomerID Churn  MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
62        3000410   Yes           90.25           952.0                  50.0   
87        3000598    No          122.00          1806.0                  75.0   
91        3000626    No           96.04           545.0                  60.0   
122       3000898   Yes             NaN             NaN                   NaN   
126       3000926   Yes             NaN             NaN                   NaN   
...           ...   ...             ...             ...                   ...   
51039     3399922    No           50.00           492.0                  50.0   
51042     3399958   Yes             NaN             NaN                   NaN   
51044     3399978   Yes             NaN             NaN                   NaN   
51045     3399990    No             NaN             NaN                   NaN   
51046     3399994    No             NaN             NaN                   NaN   

       DirectorAssistedCall

In [17]:
boolean_columns = df_encoded.select_dtypes(include='bool')

# Encode boolean columns to binary 0 and 1
df[boolean_columns.columns] = boolean_columns.astype(int)

# Print the DataFrame with boolean columns encoded to binary
print(df)




       CustomerID Churn  MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0         3000002   Yes           24.00           219.0                  22.0   
1         3000010   Yes           16.99            10.0                  17.0   
2         3000014    No           38.00             8.0                  38.0   
3         3000022    No           82.28          1312.0                  75.0   
4         3000026   Yes           17.14             0.0                  17.0   
...           ...   ...             ...             ...                   ...   
51042     3399958   Yes             NaN             NaN                   NaN   
51043     3399974    No           95.17          1745.0                  85.0   
51044     3399978   Yes             NaN             NaN                   NaN   
51045     3399990    No             NaN             NaN                   NaN   
51046     3399994    No             NaN             NaN                   NaN   

       DirectorAssistedCall

In [19]:
df.dropna(inplace=True)
df.drop(columns=['ServiceArea_freq_encoded'], inplace=True)


KeyError: "['ServiceArea_freq_encoded'] not found in axis"

In [20]:
#unique values in the CreditRatings coloumns 

non_numeric_description = df['CreditRating'].describe(include='object')
print("\nDescriptive statistics for the non-numeric column 'column_name':")
print(non_numeric_description)


Descriptive statistics for the non-numeric column 'column_name':
count      49752
unique         7
top       2-High
freq       18692
Name: CreditRating, dtype: object


In [21]:
unique = df['CreditRating'].unique()

print('All the unique values in CreditRating')
print(unique)

All the unique values in CreditRating
['1-Highest' '4-Medium' '3-Good' '2-High' '5-Low' '6-VeryLow' '7-Lowest']


In [23]:

from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Fit the LabelEncoder to your unique values and transform the column
encoded_values = label_encoder.fit_transform(df['CreditRating'])

# Print the mapping of original values to encoded values
print("Mapping of original values to encoded values:")
for original, encoded in zip(df['CreditRating'], encoded_values):
    print(original, '-->', encoded)

# Replace the column in the DataFrame with the encoded values
df['CreditRating'] = encoded_values

# Print the DataFrame with the encoded column
print("\nDataFrame with the encoded column:")
print(df)

df.to_csv('cell2celltrain.csv')

Mapping of original values to encoded values:
0 --> 0
3 --> 3
2 --> 2
3 --> 3
0 --> 0
2 --> 2
0 --> 0
0 --> 0
0 --> 0
2 --> 2
0 --> 0
3 --> 3
2 --> 2
2 --> 2
2 --> 2
0 --> 0
0 --> 0
2 --> 2
0 --> 0
3 --> 3
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
2 --> 2
0 --> 0
2 --> 2
0 --> 0
0 --> 0
2 --> 2
0 --> 0
0 --> 0
0 --> 0
0 --> 0
2 --> 2
2 --> 2
2 --> 2
3 --> 3
3 --> 3
3 --> 3
3 --> 3
0 --> 0
0 --> 0
3 --> 3
0 --> 0
0 --> 0
3 --> 3
2 --> 2
0 --> 0
3 --> 3
0 --> 0
0 --> 0
0 --> 0
3 --> 3
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
0 --> 0
3 --> 3
0 --> 0
0 --> 0
0 --> 0
2 --> 2
0 --> 0
0 --> 0
2 --> 2
1 --> 1
2 --> 2
0 --> 0
2 --> 2
0 --> 0
4 --> 4
5 --> 5
0 --> 0
2 --> 2
0 --> 0
2 --> 2
2 --> 2
2 --> 2
5 --> 5
0 --> 0
0 --> 0
0 --> 0
2 --> 2
0 --> 0
0 --> 0
0 --> 0
2 --> 2
0 --> 0
2 --> 2
2 --> 2
2 --> 2
2 --> 2
5 --> 5
0 --> 0
0 --> 0
2 --> 2
2 --> 2
6 --> 6
0 --> 0
0 --> 0
0 --> 0
3 --> 3
0 --> 0
0 --> 0
3 --> 3
0 --> 0
0 --> 0
1 --> 1
0 --> 0
0 --> 0
1 

In [24]:
unique = df['PrizmCode'].unique()

print("Unique value in PrizmCode")
print(unique)

Unique value in PrizmCode
['Suburban' 'Town' 'Other' 'Rural']


In [25]:
hot_encoding = pd.get_dummies(df['PrizmCode'],prefix="PrizmCode")

df = pd.concat([df,hot_encoding],axis=1)

df.drop('PrizmCode',axis=1,inplace=True)

df.head()

Unnamed: 0,CustomerID,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,...,ServiceArea_SFU,ServiceArea_SHE,ServiceArea_SLC,ServiceArea_SLU,ServiceArea_STL,ServiceArea_VAH,PrizmCode_Other,PrizmCode_Rural,PrizmCode_Suburban,PrizmCode_Town
0,3000002,Yes,24.0,219.0,22.0,0.25,0.0,0.0,-157.0,-19.0,...,0,0,0,0,0,0,False,False,True,False
1,3000010,Yes,16.99,10.0,17.0,0.0,0.0,0.0,-4.0,0.0,...,0,0,0,0,0,0,False,False,True,False
2,3000014,No,38.0,8.0,38.0,0.0,0.0,0.0,-2.0,0.0,...,0,0,0,0,0,0,False,False,False,True
3,3000022,No,82.28,1312.0,75.0,1.24,0.0,0.0,157.0,8.1,...,0,0,0,0,0,0,True,False,False,False
4,3000026,Yes,17.14,0.0,17.0,0.0,0.0,0.0,0.0,-0.2,...,0,0,0,0,0,0,True,False,False,False


In [26]:
boolen_columns = df.select_dtypes(include='bool')

df[boolean_columns.columns] = boolean_columns.astype(int)

# Print the DataFrame with boolean columns encoded to binary
print(df)

       CustomerID Churn  MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0         3000002   Yes           24.00           219.0                  22.0   
1         3000010   Yes           16.99            10.0                  17.0   
2         3000014    No           38.00             8.0                  38.0   
3         3000022    No           82.28          1312.0                  75.0   
4         3000026   Yes           17.14             0.0                  17.0   
...           ...   ...             ...             ...                   ...   
51035     3399894    No            0.00            76.0                  30.0   
51037     3399906    No           31.92            63.0                  17.0   
51040     3399942    No           71.99           724.0                  70.0   
51041     3399946   Yes          117.49           384.0                  30.0   
51043     3399974    No           95.17          1745.0                  85.0   

       DirectorAssistedCall

In [27]:
column_to_convert = ['PrizmCode_Other' , 'PrizmCode_Rural' , 'PrizmCode_Suburban' , 'PrizmCode_Town']

df[column_to_convert] = df[column_to_convert].replace({True:1,False:0})
df[column_to_convert] = df[column_to_convert].astype(int)
df.head()

  df[column_to_convert] = df[column_to_convert].replace({True:1,False:0})


Unnamed: 0,CustomerID,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,...,ServiceArea_SFU,ServiceArea_SHE,ServiceArea_SLC,ServiceArea_SLU,ServiceArea_STL,ServiceArea_VAH,PrizmCode_Other,PrizmCode_Rural,PrizmCode_Suburban,PrizmCode_Town
0,3000002,Yes,24.0,219.0,22.0,0.25,0.0,0.0,-157.0,-19.0,...,0,0,0,0,0,0,0,0,1,0
1,3000010,Yes,16.99,10.0,17.0,0.0,0.0,0.0,-4.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,3000014,No,38.0,8.0,38.0,0.0,0.0,0.0,-2.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,3000022,No,82.28,1312.0,75.0,1.24,0.0,0.0,157.0,8.1,...,0,0,0,0,0,0,1,0,0,0
4,3000026,Yes,17.14,0.0,17.0,0.0,0.0,0.0,0.0,-0.2,...,0,0,0,0,0,0,1,0,0,0


In [28]:
unique = df['Occupation'].unique()

print(unique)

['Professional' 'Crafts' 'Other' 'Self' 'Retired' 'Homemaker' 'Clerical'
 'Student']


In [29]:
encoding = pd.get_dummies(df['Occupation'],prefix="Occupation")

df = pd.concat([df,encoding],axis=1)

df.drop('Occupation',axis=1,inplace=True)

df.head()

Unnamed: 0,CustomerID,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,...,PrizmCode_Suburban,PrizmCode_Town,Occupation_Clerical,Occupation_Crafts,Occupation_Homemaker,Occupation_Other,Occupation_Professional,Occupation_Retired,Occupation_Self,Occupation_Student
0,3000002,Yes,24.0,219.0,22.0,0.25,0.0,0.0,-157.0,-19.0,...,1,0,False,False,False,False,True,False,False,False
1,3000010,Yes,16.99,10.0,17.0,0.0,0.0,0.0,-4.0,0.0,...,1,0,False,False,False,False,True,False,False,False
2,3000014,No,38.0,8.0,38.0,0.0,0.0,0.0,-2.0,0.0,...,0,1,False,True,False,False,False,False,False,False
3,3000022,No,82.28,1312.0,75.0,1.24,0.0,0.0,157.0,8.1,...,0,0,False,False,False,True,False,False,False,False
4,3000026,Yes,17.14,0.0,17.0,0.0,0.0,0.0,0.0,-0.2,...,0,0,False,False,False,False,True,False,False,False


In [30]:
columns_to_convert = ['Occupation_Clerical', 'Occupation_Crafts', 'Occupation_Homemaker', 'Occupation_Other',
                      'Occupation_Professional', 'Occupation_Retired', 'Occupation_Self', 'Occupation_Student']

# Convert 'True' and 'False' values to 1 and 0
df[columns_to_convert] = df[columns_to_convert].replace({True: 1, False: 0})

# Explicitly cast DataFrame to the desired data type
df[columns_to_convert] = df[columns_to_convert].astype(int)

# Print the DataFrame after conversion
print(df)
df.head()

  df[columns_to_convert] = df[columns_to_convert].replace({True: 1, False: 0})


       CustomerID Churn  MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0         3000002   Yes           24.00           219.0                  22.0   
1         3000010   Yes           16.99            10.0                  17.0   
2         3000014    No           38.00             8.0                  38.0   
3         3000022    No           82.28          1312.0                  75.0   
4         3000026   Yes           17.14             0.0                  17.0   
...           ...   ...             ...             ...                   ...   
51035     3399894    No            0.00            76.0                  30.0   
51037     3399906    No           31.92            63.0                  17.0   
51040     3399942    No           71.99           724.0                  70.0   
51041     3399946   Yes          117.49           384.0                  30.0   
51043     3399974    No           95.17          1745.0                  85.0   

       DirectorAssistedCall

Unnamed: 0,CustomerID,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,...,PrizmCode_Suburban,PrizmCode_Town,Occupation_Clerical,Occupation_Crafts,Occupation_Homemaker,Occupation_Other,Occupation_Professional,Occupation_Retired,Occupation_Self,Occupation_Student
0,3000002,Yes,24.0,219.0,22.0,0.25,0.0,0.0,-157.0,-19.0,...,1,0,0,0,0,0,1,0,0,0
1,3000010,Yes,16.99,10.0,17.0,0.0,0.0,0.0,-4.0,0.0,...,1,0,0,0,0,0,1,0,0,0
2,3000014,No,38.0,8.0,38.0,0.0,0.0,0.0,-2.0,0.0,...,0,1,0,1,0,0,0,0,0,0
3,3000022,No,82.28,1312.0,75.0,1.24,0.0,0.0,157.0,8.1,...,0,0,0,0,0,1,0,0,0,0
4,3000026,Yes,17.14,0.0,17.0,0.0,0.0,0.0,0.0,-0.2,...,0,0,0,0,0,0,1,0,0,0


In [31]:
df['HandsetPrice'].describe()

count       49752
unique         16
top       Unknown
freq        28263
Name: HandsetPrice, dtype: object

In [32]:
#Droping HandsetPrice column as there are about 56.8% value as Unknown

df.drop(columns=['HandsetPrice'],inplace=True)


In [33]:
df.head()

Unnamed: 0,CustomerID,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,...,PrizmCode_Suburban,PrizmCode_Town,Occupation_Clerical,Occupation_Crafts,Occupation_Homemaker,Occupation_Other,Occupation_Professional,Occupation_Retired,Occupation_Self,Occupation_Student
0,3000002,Yes,24.0,219.0,22.0,0.25,0.0,0.0,-157.0,-19.0,...,1,0,0,0,0,0,1,0,0,0
1,3000010,Yes,16.99,10.0,17.0,0.0,0.0,0.0,-4.0,0.0,...,1,0,0,0,0,0,1,0,0,0
2,3000014,No,38.0,8.0,38.0,0.0,0.0,0.0,-2.0,0.0,...,0,1,0,1,0,0,0,0,0,0
3,3000022,No,82.28,1312.0,75.0,1.24,0.0,0.0,157.0,8.1,...,0,0,0,0,0,1,0,0,0,0
4,3000026,Yes,17.14,0.0,17.0,0.0,0.0,0.0,0.0,-0.2,...,0,0,0,0,0,0,1,0,0,0


In [34]:
df['Homeownership'].describe()


count     49752
unique        2
top       Known
freq      33725
Name: Homeownership, dtype: object

In [35]:
df['encoded_Homeownership'] = label_encoder.fit_transform(df['Homeownership'])

print("Mapping of original values to encoded values")

for original,encoded in zip(df['Homeownership'],df['encoded_Homeownership']):
    print(original,"-->",encoded)
    
print("\n DataFrame with the encoded columnn name")
df.head()

Mapping of original values to encoded values
Known --> 0
Known --> 0
Unknown --> 1
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Unknown --> 1
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Unknown --> 1
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Unknown --> 1
Known --> 0
Unknown --> 1
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Unknown --> 1
Unknown --> 1
Known --> 0
Known --> 0
Unknown --> 1
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Unknown --> 1
Known --> 0
Unknown --> 1
Unknown --> 1
Unknown --> 1
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Unknown --> 1
Known --> 0
Known --> 0
Known --> 0
Known --> 0
Known

Unnamed: 0,CustomerID,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,...,PrizmCode_Town,Occupation_Clerical,Occupation_Crafts,Occupation_Homemaker,Occupation_Other,Occupation_Professional,Occupation_Retired,Occupation_Self,Occupation_Student,encoded_Homeownership
0,3000002,Yes,24.0,219.0,22.0,0.25,0.0,0.0,-157.0,-19.0,...,0,0,0,0,0,1,0,0,0,0
1,3000010,Yes,16.99,10.0,17.0,0.0,0.0,0.0,-4.0,0.0,...,0,0,0,0,0,1,0,0,0,0
2,3000014,No,38.0,8.0,38.0,0.0,0.0,0.0,-2.0,0.0,...,1,0,1,0,0,0,0,0,0,1
3,3000022,No,82.28,1312.0,75.0,1.24,0.0,0.0,157.0,8.1,...,0,0,0,0,1,0,0,0,0,0
4,3000026,Yes,17.14,0.0,17.0,0.0,0.0,0.0,0.0,-0.2,...,0,0,0,0,0,1,0,0,0,0


In [36]:
yes_no_columns = []

for column in df.columns:
    if 'Yes' in df[column].unique() and 'No' in df[column].unique():
        yes_no_columns.append(column)
        
print("Length is ", len(yes_no_columns))
print("Columns with 'Yes' and 'No' values:")
print(yes_no_columns)

Length is  17
Columns with 'Yes' and 'No' values:
['Churn', 'ChildrenInHH', 'HandsetRefurbished', 'HandsetWebCapable', 'TruckOwner', 'RVOwner', 'BuysViaMailOrder', 'RespondsToMailOffers', 'OptOutMailings', 'NonUSTravel', 'OwnsComputer', 'HasCreditCard', 'NewCellphoneUser', 'NotNewCellphoneUser', 'OwnsMotorcycle', 'MadeCallToRetentionTeam', 'MaritalStatus']


In [37]:
column_to_convert = ['Churn', 'ChildrenInHH', 'HandsetRefurbished', 'HandsetWebCapable', 'TruckOwner', 'RVOwner', 'BuysViaMailOrder', 'RespondsToMailOffers', 'OptOutMailings', 'NonUSTravel', 'OwnsComputer', 'HasCreditCard', 'NewCellphoneUser', 'NotNewCellphoneUser', 'OwnsMotorcycle', 'MadeCallToRetentionTeam', 'MaritalStatus']

df[column_to_convert] = df[column_to_convert].replace({'Yes':1,'No':0})



  df[column_to_convert] = df[column_to_convert].replace({'Yes':1,'No':0})


In [38]:
non_numerical = []

for column in df.columns:
    if df[column].dtype == "object":
        non_numerical.append(column)
        
print(non_numerical)
print(len(non_numerical))
    
    
    

['ServiceArea', 'Homeownership', 'MaritalStatus']
3


In [39]:
df['MaritalStatus'].describe()


count       49752
unique          3
top       Unknown
freq        18649
Name: MaritalStatus, dtype: object

In [40]:
# Droping feature MaritalStatus as there are 18649 unkown values 

df.drop(columns=['ServiceArea'], inplace=True)
print(df)

       CustomerID  Churn  MonthlyRevenue  MonthlyMinutes  \
0         3000002      1           24.00           219.0   
1         3000010      1           16.99            10.0   
2         3000014      0           38.00             8.0   
3         3000022      0           82.28          1312.0   
4         3000026      1           17.14             0.0   
...           ...    ...             ...             ...   
51035     3399894      0            0.00            76.0   
51037     3399906      0           31.92            63.0   
51040     3399942      0           71.99           724.0   
51041     3399946      1          117.49           384.0   
51043     3399974      0           95.17          1745.0   

       TotalRecurringCharge  DirectorAssistedCalls  OverageMinutes  \
0                      22.0                   0.25             0.0   
1                      17.0                   0.00             0.0   
2                      38.0                   0.00             0.0   

In [41]:
rows_with_missing_values =  df[df.isnull().any(axis=1)]

# Print the rows with missing values
print("Rows with missing NaN or null values:")
print(rows_with_missing_values)

Rows with missing NaN or null values:
Empty DataFrame
Columns: [CustomerID, Churn, MonthlyRevenue, MonthlyMinutes, TotalRecurringCharge, DirectorAssistedCalls, OverageMinutes, RoamingCalls, PercChangeMinutes, PercChangeRevenues, DroppedCalls, BlockedCalls, UnansweredCalls, CustomerCareCalls, ThreewayCalls, ReceivedCalls, OutboundCalls, InboundCalls, PeakCallsInOut, OffPeakCallsInOut, DroppedBlockedCalls, CallForwardingCalls, CallWaitingCalls, MonthsInService, UniqueSubs, ActiveSubs, Handsets, HandsetModels, CurrentEquipmentDays, AgeHH1, AgeHH2, ChildrenInHH, HandsetRefurbished, HandsetWebCapable, TruckOwner, RVOwner, Homeownership, BuysViaMailOrder, RespondsToMailOffers, OptOutMailings, NonUSTravel, OwnsComputer, HasCreditCard, RetentionCalls, RetentionOffersAccepted, NewCellphoneUser, NotNewCellphoneUser, ReferralsMadeBySubscriber, IncomeGroup, OwnsMotorcycle, AdjustmentsToCreditRating, MadeCallToRetentionTeam, CreditRating, MaritalStatus, ServiceArea_AIR, ServiceArea_APC, ServiceArea

In [42]:

df.to_csv("Encoded.csv")