# Import Libraries / Load Data

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, plot_roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import plot_importance
import xgboost as xg
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")
# Apply seaborn's "whitegrid" theme to all plots drawn afterwards.
sns.set_theme(style =  "whitegrid")

In [None]:
# Location of the input CSV, relative to the current working directory.
PATH = "items.csv"
# Load the whole file into a DataFrame using pandas' default parsing options.
data = pd.read_csv(PATH)

# Data Analysis

In [None]:
# Dimensions of the loaded dataset as (rows, columns).
data.shape

In [None]:
# Preview the first five rows to get a feel for the columns and values.
data.head()

## Target Analysis

In [None]:
def plotTargetCounts(data, target):
    """Draw a bar chart with the number of rows per distinct target value.

    Args:
        data: dataset to examine.
        target: name of the column whose value counts are plotted.
    """
    counts = data[target].value_counts()
    plt.figure(figsize=(12, 8))
    # One bar per distinct value; bar height is the number of occurrences.
    sns.barplot(x=counts.index, y=counts.to_numpy())
    plt.show()

# Plot the class distribution of the raw "is_blog" column.
plotTargetCounts(data, "is_blog")

In [None]:
# Keep only the rows whose "is_blog" value is not the literal string
# "is_blog" (presumably header lines repeated inside the CSV -- confirm
# against the data source). The explicit copy makes the filtered frame an
# independent object, so the column assignment below is not a write on a
# slice of the original frame.
data = data[data["is_blog"] != "is_blog"].copy()
# With the stray rows gone, the label column can be cast to integers.
data["is_blog"] = data["is_blog"].astype(int)
plotTargetCounts(data, "is_blog")

In [None]:
# Exact number of rows per "is_blog" class.
data["is_blog"].value_counts()

In [None]:
# Undersample the positive class: remove, in place, the last 42000 rows
# (in index order) whose "is_blog" label is 1.
positiveLabels = data.index[data["is_blog"] == 1]
data.drop(index = positiveLabels[-42000:],
          inplace = True)

In [None]:
# Re-plot the class distribution to check the effect of the row removal.
plotTargetCounts(data, "is_blog")

In [None]:
# Exact per-class row counts after the positive rows were dropped.
data["is_blog"].value_counts()

## Missing Values

In [None]:
# Number of missing (NaN) values in each column.
data.isna().sum()

In [None]:
# Same missing-value counts as a tidy two-column table
# (columnName, naCount), ordered from fewest to most missing values.
(
    data.isna()
    .sum()
    .to_frame(name="naCount")
    .reset_index()
    .rename(columns={"index": "columnName"})
    .sort_values(by="naCount")
)

## Outlier Analysis

In [None]:
# A column is a candidate for outlier analysis when it holds more than
# 100 distinct non-null values.
colsForOutlierAnalyze = [name for name in data.columns
                         if data[name].nunique() > 100]
colsForOutlierAnalyze

In [None]:
def boxPlot(data, colsForOutlierAnalyze):
    """Draw one box plot per column, grouped by the "is_blog" label.

    The subplot grid is sized from the number of requested columns, so any
    number of columns can be plotted. Three columns still produce the
    15x15 figure with three stacked plots.

    Args:
        data: dataset to analyze; must contain an "is_blog" column.
        colsForOutlierAnalyze: names of the columns to plot on the y axis.
    """
    plotCount = len(colsForOutlierAnalyze)
    if plotCount == 0:
        # Nothing to draw; avoid asking matplotlib for a zero-row grid.
        return
    # squeeze=False keeps `axes` two-dimensional even for a single plot.
    _, axes = plt.subplots(nrows = plotCount,
                           ncols = 1,
                           figsize = (15, 5 * plotCount),
                           squeeze = False)
    for ax, column in zip(axes[:, 0], colsForOutlierAnalyze):
        sns.boxplot(x = "is_blog", y = column, data = data, ax = ax)
# Draw the box plots for the high-cardinality columns selected earlier.
boxPlot(data, colsForOutlierAnalyze)

## Dtypes

In [None]:
# Inspect the dtype pandas assigned to each column.
data.dtypes

In [None]:
# Cast every column except the first one to integers, one column at a time.
columnsToCast = data.columns[1:]
for name in columnsToCast:
    data[name] = data[name].astype(int)

In [None]:
# Shuffle the rows so the sequential train/validation/test split made later
# does not depend on the original row order. The fixed random_state keeps
# the shuffle, and therefore the splits and reported metrics, reproducible
# across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Train-Test Split

In [None]:
def trainTestSplitData(data, trainLen, valLen):
    """Split *data* sequentially into train, validation and test partitions.

    Rows are taken in their current order; no shuffling happens here. The
    first ``trainLen`` fraction of the rows forms the train+validation pool
    and the remaining rows form the test partition. The last ``valLen``
    fraction of that pool is then set aside as the validation partition.

    Args:
        data: DataFrame with an "is_blog" target column, a "url" column and
            the feature columns.
        trainLen: fraction (0..1) of all rows used for train+validation.
        valLen: fraction (0..1) of the train+validation pool held out for
            validation.

    Returns:
        Tuple ``(x_train, x_test, x_val, y_train, y_test, y_val, test_urls)``
        where the ``x_*`` frames hold the features (without "is_blog" and
        "url"), the ``y_*`` series hold the "is_blog" labels and
        ``test_urls`` holds the "url" values of the test rows.

    Raises:
        ValueError: if ``trainLen`` or ``valLen`` is outside ``[0, 1]``.
    """
    for ratioName, ratio in (("trainLen", trainLen), ("valLen", valLen)):
        if not 0 <= ratio <= 1:
            raise ValueError(f"{ratioName} must be within [0, 1], got {ratio!r}")

    features = data.drop(columns = ["is_blog", "url"])
    labels = data["is_blog"]
    urls = data["url"]

    # Positional boundaries: [0, trainEnd) train, [trainEnd, poolEnd)
    # validation, [poolEnd, n) test.
    poolEnd = int(data.shape[0] * trainLen)
    trainEnd = poolEnd - int(poolEnd * valLen)

    # .iloc makes the positional intent explicit regardless of index labels.
    x_train, y_train = features.iloc[:trainEnd], labels.iloc[:trainEnd]
    x_val, y_val = features.iloc[trainEnd:poolEnd], labels.iloc[trainEnd:poolEnd]
    x_test, y_test = features.iloc[poolEnd:], labels.iloc[poolEnd:]
    test_urls = urls.iloc[poolEnd:]
    return x_train, x_test, x_val, y_train, y_test, y_val, test_urls

In [None]:
# 80% of the rows go to the train+validation pool (15% of that pool is held
# out for validation); the remaining 20% form the test partition.
x_train, x_test, x_val, y_train, y_test, y_val, test_urls = trainTestSplitData(data, 0.8, 0.15)

In [None]:
# Report the feature/target shapes of each partition as a sanity check.
for splitName, splitFeatures, splitTarget in (("Train", x_train, y_train),
                                              ("Validation", x_val, y_val),
                                              ("Test", x_test, y_test)):
    print(f"{splitName} Data Shape: {splitFeatures.shape} {splitName} Target Shape: {splitTarget.shape}")

# Model Selection

In [None]:
def gridSearchCrossValidation(tunedParams,
                              scores,
                              x_train,
                              x_val,
                              y_train,
                              y_val,
                              modelType = "KNN"
                              ):
    """Tune hyper-parameters with a cross-validated grid search and report results.

    For every metric in *scores* a fresh ``GridSearchCV`` is fitted on the
    training data using ``"<metric>_macro"`` as the scoring function. The
    best parameters, the mean/spread of every grid point and a
    classification report on the validation data are printed.

    Args:
        tunedParams: parameter grid handed to ``GridSearchCV``.
        scores: metric names (e.g. "precision", "recall"); "_macro" is
            appended to each to build the scoring string.
        x_train, y_train: data the grid search is fitted on.
        x_val, y_val: held-out data used for the classification report.
        modelType: algorithm to tune, one of "KNN", "RF" or "XGB".

    Raises:
        ValueError: if *modelType* is not one of the supported names.
    """
    # Factories are lazy so that each metric gets its own, fresh estimator.
    estimatorFactories = {
        "KNN": lambda: KNeighborsClassifier(),
        "RF": lambda: RandomForestClassifier(),
        "XGB": lambda: xg.XGBClassifier(),
    }
    if modelType not in estimatorFactories:
        # Fail early instead of hitting an unbound (or stale) `model` below.
        raise ValueError(
            f"Unsupported modelType {modelType!r}; "
            f"expected one of {sorted(estimatorFactories)}"
        )

    for score in scores:
        print(f"Hyper-Parameter Tuning for {score}")
        model = GridSearchCV(estimatorFactories[modelType](),
                             tunedParams,
                             scoring = f"{score}_macro")
        model.fit(x_train, y_train)

        print("Best parameters set found on development set:")
        print(model.best_params_)
        print("Grid scores on development set:")
        results = model.cv_results_
        for mean, std, params in zip(results['mean_test_score'],
                                     results['std_test_score'],
                                     results['params']):
            # Spread is reported as two standard deviations.
            print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params!r}")

        print("Detailed classification report:")
        # GridSearchCV predicts with the estimator refitted on the best parameters.
        print(classification_report(y_val, model.predict(x_val)))

In [None]:
# K-nearest-neighbours search space: neighbourhood size, vote weighting and
# the Minkowski power parameter (1 or 2).
scores = ["precision", "recall"]
tunedParams = [
    {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"],
        "p": [1, 2],
    }
]
gridSearchCrossValidation(tunedParams, scores,
                          x_train, x_val,
                          y_train, y_val,
                          modelType = "KNN")

In [None]:
# Random-forest search space. "auto" is intentionally not listed for
# max_features: for RandomForestClassifier it is an alias of "sqrt"
# (deprecated in scikit-learn 1.1 and removed in 1.3), so it only duplicated
# grid points and fails on newer releases.
tunedParams = [{"n_estimators": [10, 50, 100],
                "criterion": ["gini", "entropy"],
                "max_features": ["sqrt", "log2"]}]
scores = ["precision", "recall"]
gridSearchCrossValidation(tunedParams,
                          scores,
                          x_train,
                          x_val,
                          y_train,
                          y_val,
                          modelType = "RF")

In [None]:
# XGBoost search space: minimum child weight and per-tree column subsampling.
scores = ["precision", "recall"]
tunedParams = [
    {
        "min_child_weight": [5, 10],
        "colsample_bytree": [0.6, 0.8],
    }
]
gridSearchCrossValidation(tunedParams, scores,
                          x_train, x_val,
                          y_train, y_val,
                          modelType = "XGB")

# Model Evaluation

In [None]:
# Fit the final XGBoost classifier with fixed hyper-parameters on the
# training partition.
finalParams = {"colsample_bytree": 0.6,
               "min_child_weight": 5}
model = xg.XGBClassifier(**finalParams)
model.fit(x_train, y_train)

In [None]:
# ROC curve of the fitted model on the held-out test partition.
# plot_roc_curve opens a figure of its own unless it is handed an axes, which
# left the 20x20 figure created here blank; passing `ax` draws on it instead.
# NOTE: plot_roc_curve was removed in scikit-learn 1.2; on newer releases
# RocCurveDisplay.from_estimator is the replacement.
_, rocAxes = plt.subplots(figsize = (20, 20))
plot_roc_curve(model, x_test, y_test, ax = rocAxes)
rocAxes.set_title("Roc Curve")
plt.show()

In [None]:
# Feature-importance chart of the fitted XGBoost model, limited to 10 features.
plot_importance(model, max_num_features=10) # top 10 most important features
plt.show()

In [None]:
# Put the test-set predictions next to the true labels and the URLs they
# belong to, so individual rows can be inspected.
predicts = model.predict(x_test)
predictionData = pd.DataFrame({"Index": test_urls,
                               "Actual": y_test})
predictionData["Prediction"] = predicts
predictionData

In [None]:
def getClassAccuracies(predictionData):
    """Print, per actual class, the percentage of rows predicted correctly.

    For the rows whose "Actual" value is 1 (blogs) and 0 (non-blogs)
    respectively, the share of rows where "Prediction" equals "Actual" is
    computed and printed as a percentage. Nothing is returned.

    Args:
        predictionData: DataFrame with "Actual" and "Prediction" columns.
    """
    actual = predictionData["Actual"]
    isCorrect = actual == predictionData["Prediction"]

    def hitPercentage(label):
        # Correct predictions among the rows of this class, as a percentage.
        ofClass = actual == label
        return (isCorrect[ofClass].sum() / ofClass.sum()) * 100

    posRatio = hitPercentage(1)
    negRatio = hitPercentage(0)
    print(f"Model Accuracy for predict blogs: {posRatio}\nModel Accuracy for predict non-blogs: {negRatio}")
# Print the per-class hit rates for the test-set predictions.
getClassAccuracies(predictionData)