In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("data.csv").dropna()

In [3]:
# enumerating the country column
print(df['country'])
country_mapping = {country: i for i, country in enumerate(df['country'].unique())}
df['country'] = df['country'].map(country_mapping)
print(df['country'])

0                mali
1                iran
2             czechia
3               haiti
4        saudi arabia
             ...     
93451    turkmenistan
93452      cabo verde
93453     philippines
93454         ukraine
93455            laos
Name: country, Length: 93456, dtype: object
0          0
1          1
2          2
3          3
4          4
        ... 
93451     48
93452    165
93453    111
93454     91
93455     77
Name: country, Length: 93456, dtype: int64


In [4]:
# enumerating the name column
print(df['name'])
name_mapping = {name: i for i, name in enumerate(df['name'].unique())}
df['name'] = df['name'].map(name_mapping)
print(df['name'])

0        4861.html
1        1691.html
2        6103.html
3        2321.html
4        2836.html
           ...    
93451    2282.html
93452    6409.html
93453    4712.html
93454    5798.html
93455    8387.html
Name: name, Length: 93456, dtype: object
0           0
1           1
2           2
3           3
4           4
         ... 
93451    2631
93452    9876
93453    2464
93454    5774
93455    8094
Name: name, Length: 93456, dtype: int64


In [5]:
# enumerating the time column
print(df['time'])
name_mapping = {time: i for i, time in enumerate(df['time'].unique())}
df['time'] = df['time'].map(name_mapping)
print(df['time'])

0        2023-11-02 16:00:00
1        2023-11-02 08:00:00
2        2023-11-02 11:00:00
3        2023-11-02 06:00:00
4        2023-11-02 12:00:00
                ...         
93451    2023-11-03 13:00:00
93452    2023-11-03 16:00:00
93453    2023-11-03 23:00:00
93454    2023-11-03 05:00:00
93455    2023-11-03 11:00:00
Name: time, Length: 93456, dtype: object
0         0
1         1
2         2
3         3
4         4
         ..
93451    45
93452    42
93453    31
93454    43
93455    47
Name: time, Length: 93456, dtype: int64


In [6]:
# enumerating the income column
print(df['income'])
name_mapping = {income: i for i, income in enumerate(df['income'].unique())}
df['income'] = df['income'].map(name_mapping)
print(df['income'])

0          40k-60k
1            0-10k
2          20k-40k
3          20k-40k
4        150k-250k
           ...    
93451      20k-40k
93452      20k-40k
93453    150k-250k
93454      10k-20k
93455      40k-60k
Name: income, Length: 93456, dtype: object
0        0
1        1
2        2
3        2
4        3
        ..
93451    2
93452    2
93453    3
93454    4
93455    0
Name: income, Length: 93456, dtype: int64


In [7]:
# feature extraction of ip_address
print(df.columns)
df[['ip_1', 'ip_2', 'ip_3', 'ip_4']] = df['ip_address'].str.split('.', expand=True).astype(int)
df = df.drop('ip_address', axis=1)
print(df.columns)

Index(['request_id', 'gender', 'age', 'income', 'time', 'ip_address', 'name',
       'country', 'is_banned'],
      dtype='object')
Index(['request_id', 'gender', 'age', 'income', 'time', 'name', 'country',
       'is_banned', 'ip_1', 'ip_2', 'ip_3', 'ip_4'],
      dtype='object')


In [8]:
# one hot encoding remaining categorical columns
data_columns_categorical = list(df.dtypes[df.dtypes == 'object'].index)
print(data_columns_categorical)

data_cleaned = pd.get_dummies(df, columns=data_columns_categorical)
print(data_cleaned.columns)

['gender', 'age']
Index(['request_id', 'income', 'time', 'name', 'country', 'is_banned', 'ip_1',
       'ip_2', 'ip_3', 'ip_4', 'gender_Female', 'gender_Male', 'age_0-16',
       'age_17-25', 'age_26-35', 'age_36-45', 'age_46-55', 'age_56-65',
       'age_66-75', 'age_76+'],
      dtype='object')


In [9]:
print(data_cleaned.head())

   request_id  income  time  name  country  is_banned  ip_1  ip_2  ip_3  ip_4  \
0           1       0     0     0        0          0    71   169   239   175   
1           2       1     1     1        1          1    10   108   130   210   
2           3       2     2     2        2          0    74    67   246   113   
3           4       2     3     3        3          0    29    96   232     2   
4           5       3     4     4        4          0   161    88    37    41   

   gender_Female  gender_Male  age_0-16  age_17-25  age_26-35  age_36-45  \
0           True        False     False      False       True      False   
1          False         True     False      False       True      False   
2           True        False     False      False      False      False   
3          False         True     False      False      False      False   
4          False         True     False      False      False      False   

   age_46-55  age_56-65  age_66-75  age_76+  
0      Fal

In [10]:
# model 1
X = data_cleaned[['ip_1', 'ip_2', 'ip_3', 'ip_4']]
y = data_cleaned['country']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy with Best Model:", accuracy)

Accuracy with Best Model: 0.9843421193422977


In [11]:
# model 2
y = data_cleaned['income']
X = data_cleaned.drop(['income'], axis=1)
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy with Best Model:", accuracy)

Index(['request_id', 'time', 'name', 'country', 'is_banned', 'ip_1', 'ip_2',
       'ip_3', 'ip_4', 'gender_Female', 'gender_Male', 'age_0-16', 'age_17-25',
       'age_26-35', 'age_36-45', 'age_46-55', 'age_56-65', 'age_66-75',
       'age_76+'],
      dtype='object')
Accuracy with Best Model: 0.12529871241573634
