### Step 1: Import The Data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from datetime import datetime
from datetime import date

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load AdvWorksCusts.csv (path relative to the working directory) into `df`.
# Later cells clean this frame and join it to the other tables on CustomerID.
df = pd.read_csv("AdvWorksCusts.csv")


In [None]:
# Load AW_AveMonthSpend.csv; this table is keyed by CustomerID (it is
# de-duplicated and merged on that column further down).
avMoDf = pd.read_csv('AW_AveMonthSpend.csv')


In [None]:
# Load AW_BikeBuyer.csv; this table is keyed by CustomerID (it is
# de-duplicated and merged on that column further down).
bikeBuyerDf = pd.read_csv('AW_BikeBuyer.csv')

### Step 2: Understand The Data


In [None]:
# Report the size of the customer frame: row count first, then column count.
row_count, column_count = df.shape
print(row_count)
print(column_count)


In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()


In [None]:
# Show the dtype pandas assigned to each column when the CSV was read.
df.dtypes


In [None]:
# Summary statistics for the customer frame via DataFrame.describe().
df.describe()


In [None]:
# Print how many distinct values each column holds; a missing value counts
# as one distinct value, matching len(Series.unique()).
for column in df.columns:
    distinct = df[column].nunique(dropna=False)
    print(f"{column} : {distinct}")

### Step 3: Clean And Prepare The Data

#### DF

In [None]:
# Replace hyphens in column names with underscores.
# The loop variable is deliberately not called `str`, which would shadow the
# builtin inside the comprehension.
df.columns = [name.replace('-', '_') for name in df.columns]

In [None]:
# Remove the name/address columns that are not used by any later cell.
unused_columns = ['AddressLine2', 'MiddleName', 'Suffix', 'Title']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Rename the column up front; it still holds birth-date strings until
# calculate_age is applied to it further down in this cell.
df = df.rename(columns={"BirthDate": "Age"})

def calculate_age(born, as_of=date(1998, 1, 1)):
    """Return the age in whole years on ``as_of`` for a given birth date.

    Args:
        born: Birth date as a ``"YYYY-MM-DD"`` string.
        as_of: ``datetime.date`` on which the age is measured. Defaults to
            1 January 1998, the reference date this notebook was written
            against, so existing one-argument calls behave as before.

    Returns:
        The number of completed years between ``born`` and ``as_of``.

    Raises:
        ValueError: If ``born`` does not match the ``%Y-%m-%d`` format.
        TypeError: If ``born`` is not a string (e.g. a missing value).
    """
    birth_date = datetime.strptime(born, "%Y-%m-%d").date()
    # The tuple comparison is True (counts as 1) when the birthday has not
    # yet occurred in the reference year, so one year is subtracted.
    birthday_pending = (as_of.month, as_of.day) < (birth_date.month, birth_date.day)
    return as_of.year - birth_date.year - birthday_pending

# Convert the birth-date strings stored under 'Age' into ages, element-wise.
birth_dates = df['Age']
df['Age'] = birth_dates.map(calculate_age)

# Preview the first rows of the cleaned frame.
df.head()

#### AVMODF

In [None]:
# Drop rows with missing values. dropna() returns a new frame by default, so
# a bare call discards its result; inplace=True makes the removal take effect.
avMoDf.dropna(inplace=True)
# Keep one spend record per customer.
avMoDf.drop_duplicates(subset='CustomerID', keep='first', inplace=True)

print(avMoDf.shape)
print(df.shape)

# Join on one row per customer: the other two tables are de-duplicated on
# CustomerID, so the same rule is applied to the customer table here.
# Otherwise a repeated CustomerID in `df` would duplicate rows in the left
# merge and skew the medians computed below. `df` itself is left unchanged.
customers = df.drop_duplicates(subset='CustomerID', keep='first')
dataFrame = pd.merge(avMoDf, customers, how='left', on=['CustomerID'])
dataFrame.head()

#### BIKEBUYERDF

In [None]:
# Keep only the first record for each CustomerID.
bikeBuyerDf = bikeBuyerDf.drop_duplicates(subset=['CustomerID'], keep='first')

#### DATAFRAME

In [None]:
# Median AveMonthSpend for each gender / age-band segment.
gender = dataFrame['Gender']
age = dataFrame['Age']
is_female = gender == 'F'
is_male = gender == 'M'
# Strict bounds on both sides, as the female 25-45 mask has always used.
aged_25_to_45 = (age > 25) & (age < 45)

mask1 = is_female & (age > 55)
mask2 = is_male & (age > 55)
mask3 = is_female & (age < 25)
mask4 = is_male & (age < 25)
mask5 = is_female & aged_25_to_45
# Fix: this mask used to test `Age > 45`, which selected males over 45
# instead of the 25-45 band its label reports.
mask6 = is_male & aged_25_to_45

segments = (
    ('Females over 55 years of age   = ', mask1),
    ('Males over 55 years of age     = ', mask2),
    ('Females under 25 years of age  = ', mask3),
    ('Males under 25 years of age    = ', mask4),
    ('Females aged between 25 and 45 = ', mask5),
    ('Males aged between 25 and 45   = ', mask6),
)
for label, mask in segments:
    print(label + str(dataFrame[mask]['AveMonthSpend'].median()))

In [None]:
# Median AveMonthSpend for married ('M') versus single ('S') customers.
marital_status = dataFrame['MaritalStatus']
mask1 = marital_status == 'M'
mask2 = marital_status == 'S'

monthly_spend = dataFrame['AveMonthSpend']
for label, mask in (
    ('Married AveMonthSpend = ', mask1),
    ('Single AveMonthSpend  = ', mask2),
):
    print(label + str(monthly_spend[mask].median()))

In [None]:
# Median AveMonthSpend for customers with no car versus more than two cars.
cars_owned = dataFrame['NumberCarsOwned']
mask1 = cars_owned == 0
mask2 = cars_owned > 2

monthly_spend = dataFrame['AveMonthSpend']
for label, mask in (
    ('Customers with no car           = ', mask1),
    ('Customers with more than 2 cars = ', mask2),
):
    print(label + str(monthly_spend[mask].median()))

In [None]:
# Range (max - min) of AveMonthSpend for male and for female customers.
gender = dataFrame['Gender']
mask1 = gender == 'M'
mask2 = gender == 'F'

male_spend = dataFrame[mask1]['AveMonthSpend']
female_spend = dataFrame[mask2]['AveMonthSpend']

male_range = male_spend.max() - male_spend.min()
female_range = female_spend.max() - female_spend.min()

print('Male AveMonthSpend Range    = ' + str(male_range))
print('Female AveMonthSpend Range  = ' + str(female_range))


In [None]:
# Median AveMonthSpend for customers without versus with children at home.
children_at_home = dataFrame['NumberChildrenAtHome']
mask1 = children_at_home == 0
mask2 = children_at_home > 0

monthly_spend = dataFrame['AveMonthSpend']
for label, mask in (
    ('Customers with no children           = ', mask1),
    ('Customers with one or more children  = ', mask2),
):
    print(label + str(monthly_spend[mask].median()))


#### NEWDF

In [None]:
# Attach the bike-buyer table to the merged customer frame; the left join
# keeps every row of `dataFrame`.
newdf = dataFrame.merge(bikeBuyerDf, how='left', on=['CustomerID'])

In [None]:
# Split customers on the BikeBuyer flag. The names mask1 / mask2 are kept
# because the following cells reuse them.
bike_buyer = newdf['BikeBuyer']
mask1 = bike_buyer == 0
mask2 = bike_buyer == 1

# NOTE: the labels say "Avg." but the statistic printed is the median.
yearly_income = newdf['YearlyIncome']
non_buyer_income = yearly_income[mask1].median()
buyer_income = yearly_income[mask2].median()

print('Avg. YearlyIncome for Did not buy a bike    = ' + str(non_buyer_income))
print('Avg. YearlyIncome for Bought a bike         = ' + str(buyer_income))

In [None]:
# Median NumberCarsOwned for non-buyers (mask1) and buyers (mask2); both
# masks are expected to come from the preceding cell.
# NOTE: the labels say "Avg." but the statistic printed is the median.
cars_owned = newdf['NumberCarsOwned']
for label, mask in (
    ('Avg. Cars Owned for Did not buy a bike    = ', mask1),
    ('Avg. Cars Owned for Bought a bike         = ', mask2),
):
    print(label + str(cars_owned[mask].median()))

In [None]:
# Count non-null entries of every column per Occupation for the rows selected
# by mask2 (presumably still the BikeBuyer == 1 mask from the earlier cell;
# this depends on cell execution order).
a = newdf[mask2].groupby('Occupation').agg(['count'])
# agg(['count']) yields two-level columns; selecting 'BikeBuyer' leaves its
# 'count' column, i.e. the number of selected customers per occupation.
a['BikeBuyer'].head()

In [None]:
# Count bike buyers separately for male and female customers.
bought_bike = newdf['BikeBuyer'] == 1
mask1 = (newdf['Gender'] == 'M') & bought_bike
mask2 = (newdf['Gender'] == 'F') & bought_bike

# agg(['count']) returns a one-element Series, so its repr is what is printed.
male_buyers = newdf[mask1]['BikeBuyer'].agg(['count'])
female_buyers = newdf[mask2]['BikeBuyer'].agg(['count'])

print('Number of male customers who bought bike   = ' + str(male_buyers))
print('Number of female customers who bought bike = ' + str(female_buyers))

In [None]:
# Count bike buyers separately for married ('M') and single ('S') customers.
bought_bike = newdf['BikeBuyer'] == 1
mask1 = (newdf['MaritalStatus'] == 'M') & bought_bike
mask2 = (newdf['MaritalStatus'] == 'S') & bought_bike

# agg(['count']) returns a one-element Series, so its repr is what is printed.
married_buyers = newdf[mask1]['BikeBuyer'].agg(['count'])
single_buyers = newdf[mask2]['BikeBuyer'].agg(['count'])

print('Married customers who bought bike   = ' + str(married_buyers))
print('Single customers who bought bike    = ' + str(single_buyers))

In [None]:
# The identifier column is removed from newdf in place now that the merges
# are done.
newdf.drop(columns=['CustomerID'], inplace=True)

In [None]:
# Number of rows for each BikeBuyer value.
newdf_count = newdf['BikeBuyer'].value_counts()

### Step 4: Feature Analysis

In [None]:
# Columns plotted as categorical features (bar charts of counts).
categoryColumns = [
    'CountryRegionName',
    'Education',
    'Occupation',
    'Gender',
    'MaritalStatus',
]

# Columns plotted as numeric features (box plots against BikeBuyer).
numerableColumns = [
    'AveMonthSpend',
    'Age',
    'HomeOwnerFlag',
    'NumberCarsOwned',
    'NumberChildrenAtHome',
    'TotalChildren',
    'YearlyIncome',
]


In [None]:

def plotBox(newdf, cols, colX='BikeBuyer'):
    """Draw one box plot per column in ``cols``, grouped by ``colX``.

    Args:
        newdf: DataFrame handed to seaborn as ``data``; it must contain
            ``colX`` and every column named in ``cols``.
        cols: Iterable of column names; each gets its own figure with the
            column on the y-axis.
        colX: Column used for the x-axis grouping. Defaults to 'BikeBuyer'.

    Returns:
        None. Each figure is displayed with ``plt.show()`` as it is drawn.
    """
    # The style does not change between iterations, so set it once, before
    # the first figure is created, instead of after each plt.figure() call.
    sns.set_style("whitegrid")
    for col in cols:
        plt.figure(figsize=(18, 6))
        sns.boxplot(x=colX, y=col, data=newdf)
        plt.xlabel(colX)
        plt.ylabel(col)
        plt.show()

# Box plot of every numeric feature against the default x column, BikeBuyer.
plotBox(newdf, numerableColumns)

In [None]:
# Helper column of ones; counting it per group gives the group sizes.
# It is left on newdf after this cell.
newdf['dummy'] = np.ones(shape=newdf.shape[0])

# (BikeBuyer value, title suffix) for the left and right panel of each figure.
panels = ((0, '\n No Doesnt'), (1, '\n Yes BikeBuyer'))

for col in categoryColumns:
    print(col)
    grouped = newdf[['dummy', 'BikeBuyer', col]].groupby(['BikeBuyer', col], as_index=False)
    counts = grouped.count()
    _ = plt.figure(figsize=(10, 4))
    for position, (buyer_flag, title_suffix) in enumerate(panels, start=1):
        # Category counts restricted to one BikeBuyer value.
        temp = counts[counts['BikeBuyer'] == buyer_flag][[col, 'dummy']]
        plt.subplot(1, 2, position)
        plt.bar(temp[col], temp.dummy)
        plt.xticks(rotation=90)
        plt.title('Counts for ' + col + title_suffix)
        plt.ylabel('count')
    plt.show()

### Step 5: Machine Learning

In [None]:
s