# Machine Learning Model


In [None]:
# Mount Google Drive/Folder containing Dataset
# (the google.colab module is imported here, so this cell is meant to run inside Colab)
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the shared project folder on the mounted Drive
%cd '/content/drive/MyDrive/I310D/Data_Science_Girlies'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/I310D/Data_Science_Girlies


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [None]:
# Importing dataset as CSV file
dummy_df = pd.read_csv('/content/drive//MyDrive/I310D/Data_Science_Girlies/dummy_data.csv')
# Print feature values and their data type/frequency counts
dummy_df.info()

# Data-quality checks. Only the last bare expression of a cell is displayed,
# so both results are printed explicitly; otherwise the missing-value counts
# are computed and then silently discarded.
print('Missing values per column:')
print(dummy_df.isnull().sum())
print('Duplicate rows:', dummy_df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   age           1000 non-null   int64 
 1   gender        1000 non-null   object
 2   time_spent    1000 non-null   int64 
 3   platform      1000 non-null   object
 4   interests     1000 non-null   object
 5   location      1000 non-null   object
 6   demographics  1000 non-null   object
 7   profession    1000 non-null   object
 8   income        1000 non-null   int64 
 9   indebt        1000 non-null   bool  
 10  isHomeOwner   1000 non-null   bool  
 11  Owns_Car      1000 non-null   bool  
dtypes: bool(3), int64(3), object(6)
memory usage: 73.4+ KB


0

In [None]:
# Bucket the numeric 'age' and 'income' columns into three equal-frequency
# groups (terciles), storing each result in a new labelled column.
tercile_specs = {
    'age_range': ('age', ['Young', 'Adult', 'Old']),
    'income_type': ('income', ['Poor', 'Mid', 'Rich']),
}
for binned_col, (source_col, bucket_names) in tercile_specs.items():
    dummy_df[binned_col] = pd.qcut(dummy_df[source_col], q=3, labels=bucket_names)

# Preview the first rows, including the two new columns
dummy_df.head()

Unnamed: 0,age,gender,time_spent,platform,interests,location,demographics,profession,income,indebt,isHomeOwner,Owns_Car,age_range,income_type
0,56,male,3,Instagram,Sports,United Kingdom,Urban,Software Engineer,19774,True,False,False,Old,Rich
1,46,female,2,Facebook,Travel,United Kingdom,Urban,Student,10564,True,True,True,Adult,Poor
2,32,male,8,Instagram,Sports,Australia,Sub_Urban,Marketer Manager,13258,False,False,False,Young,Poor
3,60,non-binary,5,Instagram,Travel,United Kingdom,Urban,Student,12500,False,True,False,Old,Poor
4,25,male,1,Instagram,Lifestlye,Australia,Urban,Software Engineer,14566,False,True,True,Young,Mid


In [None]:
# Featurize function
def featurize(df):
    """Encode the categorical and boolean feature columns as integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'gender', 'platform', 'interests',
        'location', 'demographics', 'profession', 'age_range', 'income_type',
        'indebt', 'isHomeOwner' and 'Owns_Car'. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the eleven encoded columns, in the order
        listed above. The input frame is left unmodified, so calling this
        function repeatedly on the same frame gives the same result.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a categorical column holds a non-null value that has no entry in
        its mapping (previously such values silently turned into NaN).
    """
    # explicit mappings for each categorical variable -> numerical values
    category_codes = {
        'gender': {'male': 2, 'female': 1, 'non-binary': 0},
        'platform': {'Instagram': 2, 'Facebook': 1, 'YouTube': 0},
        # 'Lifestlye' is the spelling used in the dataset; the correctly
        # spelled 'Lifestyle' is accepted as an alias for hand-entered input.
        'interests': {'Sports': 2, 'Travel': 1, 'Lifestlye': 0, 'Lifestyle': 0},
        'location': {'United States': 2, 'United Kingdom': 1, 'Australia': 0},
        'demographics': {'Urban': 2, 'Sub_Urban': 1, 'Rural': 0},
        'profession': {'Software Engineer': 2, 'Student': 1, 'Marketer Manager': 0},
        'age_range': {'Young': 2, 'Adult': 1, 'Old': 0},
        'income_type': {'Poor': 2, 'Mid': 1, 'Rich': 0},
    }
    flag_columns = ['indebt', 'isHomeOwner', 'Owns_Car']

    # Work on a copy of just the needed columns so the caller's frame is
    # never overwritten with encoded values.
    encoded = df[list(category_codes) + flag_columns].copy()

    for column, codes in category_codes.items():
        values = encoded[column]
        # Missing values pass through as NaN; anything else must be a known key.
        unknown = values[values.notna() & ~values.isin(list(codes))]
        if not unknown.empty:
            raise ValueError(
                f"Column '{column}' contains values with no numeric mapping: "
                f"{sorted(set(map(str, unknown)))}"
            )
        encoded[column] = values.map(codes)

    # Including boolean values direct as binary
    for column in flag_columns:
        encoded[column] = encoded[column].astype(int)

    return encoded

# Encode every categorical / boolean feature as a number
X_processed = featurize(dummy_df)

# Show the encoded feature table
print('Featurized Dataset', X_processed, sep='\n')

print('**********************************************************************')

# Training table = encoded features plus the original (already numeric)
# time_spent column, so the model only ever sees numerical values
train = X_processed.assign(time_spent=dummy_df['time_spent'])

# Show the table that will be used for training
print('Adjusted Training Dataset', train, sep='\n')

# Target: time_spent; predictors: every other column of the training table
y = train['time_spent']
X = train.drop(columns='time_spent')

Featurized Dataset
     gender  platform  interests  location  demographics  profession  \
0         2         2          2         1             2           2   
1         1         1          1         1             2           1   
2         2         2          2         0             1           0   
3         0         2          1         1             2           1   
4         2         2          0         0             2           2   
..      ...       ...        ...       ...           ...         ...   
995       1         2          0         1             0           0   
996       0         0          1         1             0           2   
997       0         0          1         1             0           1   
998       1         0          2         0             1           0   
999       1         0          1         0             0           1   

    age_range income_type  indebt  isHomeOwner  Owns_Car  
0           0           0       1            0         0 

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion and predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Root of the mean squared error on the held-out rows (lower is better)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Square Error:", rmse)

Root Mean Square Error: 2.5171909900087757


# Evaluating the ML Model

In [None]:
# loading in original dataset (raw 'age' and 'income' values are needed to
# recover the tercile boundaries that the training features were built from)
dummy_df = pd.read_csv('/content/drive//MyDrive/I310D/Data_Science_Girlies/dummy_data.csv')

# test case data in original form (matching original dataset values)
new_data = {
    'age': [54],
    'gender': ['male'],
    'platform': ['Instagram'],
    'interests': ['Sports'],
    'location': ['United Kingdom'],
    'demographics': ['Urban'],
    'profession': ['Software Engineer'],
    'income': [19774],
    'indebt': [1],
    'isHomeOwner': [0],
    'Owns_Car': [0]
}

# new dataframe for test case
new_df = pd.DataFrame(new_data)

# Assign 'age_range' / 'income_type' using the tercile boundaries of the
# dataset itself. Re-running qcut on the dataset with the test row appended
# would shift the boundaries, so the test case could be binned differently
# from the rows the model was trained on.
for raw_col, binned_col, bin_labels in [
    ('age', 'age_range', ['Young', 'Adult', 'Old']),
    ('income', 'income_type', ['Poor', 'Mid', 'Rich']),
]:
    # retbins=True also returns the bin edges: [min, 1st tercile, 2nd tercile, max]
    bin_edges = pd.qcut(dummy_df[raw_col], 3, retbins=True)[1]
    # Open-ended outer bins so values outside the dataset's range still get a label
    bin_edges = np.concatenate(([-np.inf], bin_edges[1:-1], [np.inf]))
    # Stored as plain strings (object dtype) rather than a Categorical
    new_df[binned_col] = pd.cut(new_df[raw_col], bins=bin_edges, labels=bin_labels).astype(object)

# output the assigned bins for the test case
test_case_quantiles = new_df[['age_range', 'income_type']]
print(test_case_quantiles)

print('***********************************************************************')
# print the current new dataset
print(new_df)

# drops the original features of 'age' and 'income' that were stored
# we now have 'age_range' and 'income_type'
new_df = new_df.drop(['age', 'income'], axis=1)

print('***********************************************************************')
# print the new dataset with the dropped values
print (new_df)

# featurize this new dataset to address categorical values
featurized_output = featurize(new_df)

print('***********************************************************************')
# print out new featurized dataset
print(featurized_output)

print('***********************************************************************')
print('FINAL OUTPUT')
# predict using regression the time_spent based on the test case values
predicted_screen_time = model.predict(featurized_output)
# Unit label aligned with the input demo cell, which reports the same
# prediction in hours.
print(f"Predicted screen time: {predicted_screen_time[0]} hours")

     age_range income_type
1000       Old        Rich
***********************************************************************
   age gender   platform interests        location demographics  \
0   54   male  Instagram    Sports  United Kingdom        Urban   

          profession  income  indebt  isHomeOwner  Owns_Car age_range  \
0  Software Engineer   19774       1            0         0       Old   

  income_type  
0        Rich  
***********************************************************************
  gender   platform interests        location demographics         profession  \
0   male  Instagram    Sports  United Kingdom        Urban  Software Engineer   

   indebt  isHomeOwner  Owns_Car age_range income_type  
0       1            0         0       Old        Rich  
***********************************************************************
   gender  platform  interests  location  demographics  profession  age_range  \
0       2         2          2         1             2    

# ML Model Input Demo

In [None]:
# UI Fields
age = 73 # @param {type:"number"}
gender = 'female' # @param ["male", "female", "non-binary"]
platform = 'Facebook' # @param ["Instagram", "Facebook", "YouTube"]
interests = "Travel" # @param ["Sports", "Travel", "Lifestyle"]
location = 'Australia' # @param ["United States", "United Kingdom", "Australia"]
demographics = "Sub_Urban" # @param ["Urban", "Sub_Urban", "Rural"]
profession = "Student" # @param ["Software Engineer", "Student", "Marketer Manager"]
income = 100000 # @param {type:"number"}
inDebt = True # @param ["False", "True"] {type:"raw"}
isHomeOwner = False # @param ["False", "True"] {type:"raw"}
Owns_Car = True # @param ["False", "True"] {type:"raw"}

new_data = {
    'age': [age],
    'gender': [gender],
    'platform': [platform],
    # The dataset (and the featurize mapping built from it) spells this
    # category 'Lifestlye'; translate the form's correct spelling so the
    # choice is encoded instead of turning into NaN and breaking predict().
    'interests': ['Lifestlye' if interests == 'Lifestyle' else interests],
    'location': [location],
    'demographics': [demographics],
    'profession': [profession],
    'income': [income],
    'indebt': [inDebt],
    'isHomeOwner': [isHomeOwner],
    'Owns_Car': [Owns_Car]
}

# new dataframe for test case
new_df = pd.DataFrame(new_data)

# Assign 'age_range' / 'income_type' using the tercile boundaries of the
# dataset itself. Re-running qcut on the dataset with the user's row appended
# would shift the boundaries, so the input could be binned differently from
# the rows the model was trained on.
for raw_col, binned_col, bin_labels in [
    ('age', 'age_range', ['Young', 'Adult', 'Old']),
    ('income', 'income_type', ['Poor', 'Mid', 'Rich']),
]:
    # retbins=True also returns the bin edges: [min, 1st tercile, 2nd tercile, max]
    bin_edges = pd.qcut(dummy_df[raw_col], 3, retbins=True)[1]
    # Open-ended outer bins so values outside the dataset's range still get a label
    bin_edges = np.concatenate(([-np.inf], bin_edges[1:-1], [np.inf]))
    # Stored as plain strings (object dtype) rather than a Categorical
    new_df[binned_col] = pd.cut(new_df[raw_col], bins=bin_edges, labels=bin_labels).astype(object)

# output the assigned bins for the test case
test_case_quantiles = new_df[['age_range', 'income_type']]
print(test_case_quantiles)

print('***********************************************************************')
# print the current new dataset
print(new_df)

# drops the original features of 'age' and 'income' that were stored
# we now have 'age_range' and 'income_type'
new_df = new_df.drop(['age', 'income'], axis=1)

print('***********************************************************************')
# print the new dataset with the dropped values
print (new_df)

# featurize this new dataset to address categorical values
featurized_output = featurize(new_df)

print('***********************************************************************')
# print out new featurized dataset
print(featurized_output)

print('***********************************************************************')
print('FINAL OUTPUT')
# predict using regression the time_spent based on the test case values
predicted_screen_time = model.predict(featurized_output)
print(f"Predicted screen time: {predicted_screen_time[0]} hours")


     age_range income_type
1000       Old        Rich
***********************************************************************
   age  gender  platform interests   location demographics profession  income  \
0   73  female  Facebook    Travel  Australia    Sub_Urban    Student  100000   

   indebt  isHomeOwner  Owns_Car age_range income_type  
0    True        False      True       Old        Rich  
***********************************************************************
   gender  platform interests   location demographics profession  indebt  \
0  female  Facebook    Travel  Australia    Sub_Urban    Student    True   

   isHomeOwner  Owns_Car age_range income_type  
0        False      True       Old        Rich  
***********************************************************************
   gender  platform  interests  location  demographics  profession  age_range  \
0       1         1          1         0             1           1          0   

   income_type  indebt  isHomeOwner  Ow