In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [None]:
# Load the four input CSV tables used by the rest of the notebook.
clients = pd.read_csv("/kaggle/input/purchase-prediction/clients2.csv")
products = pd.read_csv("/kaggle/input/purchase-prediction/products.csv")
train = pd.read_csv("/kaggle/input/purchase-prediction/train.csv")
train_purch = pd.read_csv("/kaggle/input/purchase-prediction/train_purch.csv")


In [None]:
# Parse the timestamp columns into pandas datetimes.
train_purch["transaction_datetime"] = pd.to_datetime(train_purch["transaction_datetime"])
for date_col in ("first_issue_date", "first_redeem_date"):
    clients[date_col] = pd.to_datetime(clients[date_col])

## Перевёл столбцы с датами в datetime с помощью pd.to_datetime

In [None]:
# Replace implausible ages (negative or above 100) with the median age.
# The assignment is restricted to the "age" column: indexing the frame with a
# row-only boolean mask would overwrite EVERY column of the matching rows
# (client_id, gender, ...) with the median.
age_is_outlier = (clients["age"] < 0) | (clients["age"] > 100)
clients.loc[age_is_outlier, "age"] = clients["age"].median()
# Drop client_id.1 (presumably a duplicate of client_id - verify) and the two date columns.
clients = clients.drop(columns=["client_id.1", "first_issue_date", "first_redeem_date"])
# Drop the level_1..level_4 columns from the product table.
products = products.drop(columns=["level_1", "level_2", "level_3", "level_4"])

## Дропнул client_id.1 и levels, выбросы age заменил медианой; даты первой выдачи (first_issue_date) и первого выкупа (first_redeem_date) удалил (их дельта timedelta_purchase в коде ниже не вычисляется)

In [None]:
# clients + train, joined on client_id
clients_train = clients.merge(train, on="client_id")
# ... + purchase history, joined on client_id
clients_train_purch = clients_train.merge(train_purch, on="client_id")
# ... + product attributes, joined on product_id; transaction_id is not kept
all_feat = clients_train_purch.merge(products, on="product_id").drop(columns="transaction_id")

## Соединил датафреймы: clients с train, затем с train_purch и с products (результат — all_feat)

In [None]:
def active_user(df, days=60):
    """Add an ``active_user`` level (0-3) describing each client's recent activity.

    The level is built from the ``days`` days leading up to the latest
    ``transaction_datetime`` in ``df``: ``purchased`` is summed per client over
    that window and the sums are bucketed by their own quartiles.

    Levels:
        0 -- sum below the 25th percentile
        1 -- from the 25th up to (not including) the 50th percentile
        2 -- from the 50th up to (not including) the 75th percentile
        3 -- at or above the 75th percentile

    Args:
        df: frame with ``transaction_datetime`` (datetime values), ``purchased``
            and ``client_id`` columns. It is not modified.
        days: length of the activity window in days (default 60).

    Returns:
        A new frame: ``df`` inner-joined with the per-client ``active_user``
        column. Rows of clients with no transaction inside the window are
        therefore dropped. An ``active_user`` column already present in ``df``
        is replaced, so calling the function twice does not create
        ``active_user_x`` / ``active_user_y`` duplicates.
    """
    latest = df["transaction_datetime"].max()
    window_start = latest - datetime.timedelta(days=days)

    # A boolean mask selects the same rows as slicing a descending DatetimeIndex
    # from `latest` to `window_start` (both bounds inclusive), without in-place
    # edits on a slice.
    in_window = df["transaction_datetime"] >= window_start
    recent = df.loc[in_window, ["client_id", "purchased"]]
    purchases = recent.groupby("client_id")["purchased"].sum()

    p25, p50, p75 = (purchases.quantile(q) for q in (0.25, 0.5, 0.75))
    # Quantiles are non-decreasing, so the number of thresholds a client
    # reaches is exactly the 0-3 level.
    level = (
        (purchases >= p25).astype("int64")
        + (purchases >= p50).astype("int64")
        + (purchases >= p75).astype("int64")
    )
    levels = level.rename("active_user").reset_index()

    # NOTE(review): the inner join silently removes clients that were inactive
    # in the window; confirm this is intended rather than assigning them level 0.
    base = df.drop(columns="active_user", errors="ignore")
    return pd.merge(base, levels, on="client_id")

In [None]:


def ratings(df: pd.DataFrame) -> "tuple[pd.DataFrame, list[str]]":
    """Add a quartile-based ``<col>_rating`` feature for every ``*_id`` column.

    For each column whose name ends with ``_id`` (except ``transaction_id``),
    ``purchased`` is summed per distinct id and the sums are bucketed by their
    own quartiles into a 0-3 level:

        0 -- below the 25th percentile
        1 -- from the 25th up to (not including) the 50th percentile
        2 -- from the 50th up to (not including) the 75th percentile
        3 -- at or above the 75th percentile

    The level is merged back onto ``df`` as ``<col>_rating`` and, at the end,
    ``active_user`` is applied to the result.

    Args:
        df: frame containing ``purchased`` plus the ``*_id`` columns, and
            whatever columns ``active_user`` needs.

    Returns:
        ``(frame, names)`` where ``frame`` is the enriched frame and ``names``
        lists, for each processed column, the id column name followed by its
        rating column name.
    """
    rating_columns = []
    # The column list is fixed up front, so the rating columns added inside the
    # loop are never themselves processed.
    id_columns = [c for c in df.columns if c.endswith("_id") and c != "transaction_id"]

    for col in id_columns:
        rating_col = col + "_rating"
        purchases = df.groupby(col)["purchased"].sum()

        p25, p50, p75 = (purchases.quantile(q) for q in (0.25, 0.5, 0.75))
        # Quantiles are non-decreasing, so the number of thresholds reached is
        # exactly the 0-3 level.
        level = (
            (purchases >= p25).astype("int64")
            + (purchases >= p50).astype("int64")
            + (purchases >= p75).astype("int64")
        )
        levels = level.rename(rating_col).reset_index()

        rating_columns.extend([col, rating_col])
        # Replace a rating column left over from an earlier run instead of
        # letting the merge create <col>_rating_x / <col>_rating_y.
        df = pd.merge(df.drop(columns=rating_col, errors="ignore"), levels, on=col)

    df = active_user(df)
    return df, rating_columns


In [None]:
# Build the rating features; the second value lists the id / rating column names.
rating, df_for_test = ratings(all_feat)

In [None]:
# ratings() already finishes with active_user(df); applying active_user() a second
# time here merged a duplicate of the feature (active_user_x / active_user_y).
# Only verify that the feature is present.
assert "active_user" in rating.columns, "ratings() should have added the active_user column"

In [None]:
# Preview the id / rating columns (first rows only instead of the whole frame).
rating[df_for_test].head()

In [None]:
# Index by client, drop the raw timestamp, and encode gender as integer labels.
rating = rating.set_index("client_id").drop(columns="transaction_datetime")
gender_encoder = LabelEncoder()
rating["gender"] = gender_encoder.fit_transform(rating["gender"])

In [None]:
# target = 1 when purchased and treatment_flg are both 1 or both 0; otherwise 0.
both_one = (rating["purchased"] == 1) & (rating["treatment_flg"] == 1)
both_zero = (rating["purchased"] == 0) & (rating["treatment_flg"] == 0)
rating["target"] = 0
rating.loc[both_one | both_zero, "target"] = 1

In [None]:
# Load the test purchases. The following cells merge and inspect `test_data`,
# so with this load commented out they fail with NameError on a fresh kernel.
test_data = pd.read_csv("/kaggle/input/purchase-prediction/test_purch.csv")
test_data.head()

In [None]:
# Attach client attributes (on client_id), then product attributes (on product_id).
test_data = test_data.merge(clients, on="client_id").merge(products, on="product_id")

In [None]:
# Number of distinct clients in the merged test frame.
test_data["client_id"].nunique()

In [None]:
# Preview the first rows instead of rendering the whole frame.
test_data.head()

In [None]:
# Preview the first rows instead of rendering the whole frame.
clients.head()

In [None]:
# Number of distinct product ids present in both `products` and `test_data`.
product_ids_catalog = set(products["product_id"])
product_ids_test = set(test_data["product_id"])
len(product_ids_catalog & product_ids_test)

In [None]:
# NOTE(review): this repeats the product_id overlap check from the previous cell;
# confirm whether a different comparison was intended here.
product_ids_catalog = set(products["product_id"])
product_ids_test = set(test_data["product_id"])
len(product_ids_catalog & product_ids_test)

In [None]:
# Number of distinct client ids present in both `clients` and `test_data`.
client_ids_known = set(clients["client_id"])
client_ids_test = set(test_data["client_id"])
len(client_ids_known & client_ids_test)

In [None]:
# Split into features and target.
# `target` is computed directly from `purchased` and `treatment_flg`, so leaving
# those two columns in X leaks the label to the model.
# NOTE(review): the *_rating / active_user features are also derived from
# `purchased`; confirm they are computed on data that is safe to use as features.
X = rating.drop(columns=["target", "purchased", "treatment_flg"])
# NOTE(review): only numeric/bool columns are kept because the forest cannot
# consume strings or datetimes; confirm whether the raw id columns should be
# encoded instead of being left out.
X = X.select_dtypes(include=["number", "bool"])
y = rating["target"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)
# Fill missing values in both parts so the hold-out set is usable later as well.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Default hyper-parameters; seeded so that repeated runs give the same scores.
rfc = RandomForestClassifier(n_jobs=-1, random_state=17)

# 5-fold cross-validation on the training part (the hold-out part is not scored here).
results = cross_val_score(rfc, X_train, y_train, cv=5)

print("CV accuracy score: {:.2f}%".format(results.mean()*100))

In [None]:
# (rows, columns) of the feature frame.
n_rows, n_cols = rating.shape
(n_rows, n_cols)

In [None]:
# Preview the first rows instead of rendering the whole frame.
X.head()

In [None]:
X_train = X_train.fillna(0)
# DataFrame.isna() works for every dtype (np.isnan raises TypeError on
# object/datetime columns); the double sum yields one total count, matching the label.
print("The number of nans in x_train: ", int(X_train.isna().sum().sum()))