# Data Modeling Sandbox

In [13]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read parquet files into dataframes
offer_df = pd.read_parquet('data/offer_df.parquet')
offer_complete_df = pd.read_parquet('data/offer_complete_df.parquet')
transaction_df = pd.read_parquet('data/transaction_df.parquet')

In [14]:
offer_complete_df.sample(n=5)

Unnamed: 0,user_id,offer_id,time,age,income,days_as_member,F,M,O,reward,difficulty,duration,email,mobile,social,web,offer_completed,offer_viewed
19018,6de6f7e081af455886287068f1b40419,fafdcd668e3743c1bb461111dcafc2a4,2778,125.0,335000.0,9450.0,5.0,0.0,0.0,10.0,50.0,50.0,5.0,5.0,5.0,5.0,1,2
3283,1359818aa5bc4becaad9c18df5dbf776,ae264e3637204a6fb9bb56bc8210ddfd,1758,156.0,297000.0,5700.0,3.0,0.0,0.0,30.0,30.0,21.0,3.0,3.0,3.0,0.0,1,1
37871,db564686d31c48e4b35a4ee0ce50f824,2298d6c36e964ae4a3e7e9706d1fb8c2,822,126.0,112000.0,3592.0,0.0,2.0,0.0,6.0,14.0,14.0,2.0,2.0,2.0,2.0,0,1
28430,a2f9b5fcc34649cf82d76d221024a644,0b1e1539f2cc45b7b9fa7c272da2e1d7,1290,174.0,174000.0,5721.0,3.0,0.0,0.0,15.0,60.0,30.0,3.0,0.0,0.0,3.0,1,1
43616,fca0cfd25f0f4d489137d3e4bd863e20,2906b810c7d4411798c6938adc9daaa5,2244,170.0,230000.0,12090.0,5.0,0.0,0.0,10.0,50.0,35.0,5.0,5.0,0.0,5.0,2,1


In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load the dataset
data = offer_complete_df

# drop the user_id, offer_id, time columns
# and all non numeric columns
data = data.drop(['user_id', 'offer_id', 'time'], axis=1)

# drop all non numeric columns
data = data.select_dtypes(include=['float64', 'int64'])

# make 'offer_completed' the last column
targetcol = data.pop('offer_completed')
data['offer_completed'] = targetcol

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.2, random_state=42)

# # Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the XGBRegressor model
xgb = XGBRegressor()

# Perform Recursive Feature Elimination (RFE)
rfe = RFE(estimator=xgb, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)

# Select the important features from the data
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Print the selected features
selected_features = X_train.columns[rfe.support_]
# info.logger.info("Selected features: ", list(selected_features))
print("Selected features: ", list(selected_features))

# Train the XGBRegressor model with selected features
xgb_selected = XGBRegressor()
xgb_selected.fit(X_train_selected, y_train)

# Make predictions on the test set using the model with selected features
y_pred = xgb_selected.predict(X_test_selected)

# Calculate the accuracy score
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print('XGBRegressor R^2:', r2)
print('XGBRegressor MSE:', mse)
print('XGBRegressor RMSE:', rmse)
print('XGBRegressor MAE:', mae)

# Print the feature importance
importance = xgb_selected.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %s, Score: %.5f' % (selected_features[i],v))

KeyError: 'offer_completed'

In [19]:
# write feature importance to a dataframe
feature_importance = pd.DataFrame({'feature': selected_features, 'importance': importance})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
feature_importance

NameError: name 'selected_features' is not defined

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectFromModel

# Load the dataset
data = offer_complete_df

# drop the user_id, offer_id, time columns
# and all non numeric columns
data = data.drop(['user_id', 'offer_id', 'time'], axis=1)

# drop all non numeric columns
data = data.select_dtypes(include=['float64', 'int64'])

# make 'offer_completed' the last column
targetcol = data.pop('offer_completed')
data['offer_completed'] = targetcol

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.2, random_state=42)

# # Scale the data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Initialize the XGBClassifier model
xgb = XGBRegressor()

# Train the XGBClassifier model
xgb.fit(X_train, y_train)

# Feature selection using XGBClassifier
selection = SelectFromModel(xgb, threshold='median')
selection.fit(X_train, y_train)

# Print the indices of the selected features
selected_features = selection.get_support(indices=True)
# print('Selected features:', selected_features)
# get the column names of the selected features
selected_features = X_train.columns[selected_features]
print('Selected features:')
print('\n'.join(list(selected_features)))

# Select the important features from the data
X_train_selected = selection.transform(X_train)
X_test_selected = selection.transform(X_test)


# Train the XGBRegressor model with selected features
xgb_selected = XGBRegressor()
xgb_selected.fit(X_train_selected, y_train)

# Make predictions on the test set using the model with selected features
y_pred = xgb_selected.predict(X_test_selected)

# Calculate the accuracy score

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('XGBRegressor R^2:', r2)
print('XGBRegressor MSE:', mse)
print('XGBRegressor MAE:', mae)

print(len(selected_features))
print(len(xgb.feature_importances_))

# Print the feature importance
importance = xgb_selected.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %s, Score: %.5f' % (selected_features[i],v))

In [None]:
# write feature importance to a dataframe
feature_importance = pd.DataFrame({'feature': selected_features, 'importance': importance})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
feature_importance

In [None]:
import matplotlib.pyplot as plt
# plot the feature importance
plt.bar([x for x in range(len(importance))], importance)