# Logistic Regression Model for PL-S5E6

The feature engineering ideas used here come from the [EDA Notebook](https://www.kaggle.com/code/suhyukchoi/pl-s5e6-eda-notebook).

# Setup for Training

## Loading Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from itertools import combinations
import gc

## Loading Datasets

In [None]:
# Extra rows from "Fertilizer Prediction.csv" are appended to the competition training set below.
org_train = pd.read_csv("data/Fertilizer Prediction.csv")

# Competition files: the first CSV column is used as the row index.
test = pd.read_csv("data/test.csv", index_col = 0)
train = pd.concat(
    [pd.read_csv("data/train.csv", index_col = 0), org_train],
    axis=0,
    ignore_index=True,  # rebuild a clean 0..n-1 index after stacking the two sources
)

train.head()


Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,27,69,65,Sandy,Millets,30,6,18,28-28
2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,35,58,43,Red,Paddy,37,2,16,DAP


## Check Dataset

### Train Dataset

In [None]:
# Column dtypes and non-null counts of the combined training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850000 entries, 0 to 849999
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Temparature      850000 non-null  int64 
 1   Humidity         850000 non-null  int64 
 2   Moisture         850000 non-null  int64 
 3   Soil Type        850000 non-null  object
 4   Crop Type        850000 non-null  object
 5   Nitrogen         850000 non-null  int64 
 6   Potassium        850000 non-null  int64 
 7   Phosphorous      850000 non-null  int64 
 8   Fertilizer Name  850000 non-null  object
dtypes: int64(6), object(3)
memory usage: 58.4+ MB


In [None]:
# Summary statistics of the numeric training columns.
train.describe()

Unnamed: 0,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous
count,850000.0,850000.0,850000.0,850000.0,850000.0,850000.0
mean,31.503534,61.032665,45.162887,23.081215,9.477581,21.066198
std,4.024909,6.648149,11.799929,11.219842,5.765965,12.352064
min,25.0,50.0,25.0,4.0,0.0,0.0
25%,28.0,55.0,35.0,13.0,4.0,10.0
50%,32.0,61.0,45.0,23.0,9.0,21.0
75%,35.0,67.0,55.0,33.0,14.0,32.0
max,38.0,72.0,65.0,42.0,19.0,42.0


In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 750000 to 999999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Temparature  250000 non-null  int64 
 1   Humidity     250000 non-null  int64 
 2   Moisture     250000 non-null  int64 
 3   Soil Type    250000 non-null  object
 4   Crop Type    250000 non-null  object
 5   Nitrogen     250000 non-null  int64 
 6   Potassium    250000 non-null  int64 
 7   Phosphorous  250000 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 17.2+ MB


In [None]:
# Summary statistics of the numeric test columns (compare against the training summary above).
test.describe()

Unnamed: 0,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,31.491648,61.04512,45.190444,23.139612,9.487764,21.12206
std,4.024093,6.636372,11.793167,11.215956,5.76686,12.38087
min,25.0,50.0,25.0,4.0,0.0,0.0
25%,28.0,55.0,35.0,13.0,4.0,10.0
50%,31.0,61.0,45.0,23.0,10.0,21.0
75%,35.0,67.0,55.0,33.0,14.0,32.0
max,38.0,72.0,65.0,42.0,19.0,42.0


- Neither dataset has missing values, so no imputation is needed.

## Define Useful Methods/Variables

### Variables

In [None]:
# Column bookkeeping shared by the feature-engineering and modelling cells.
TARGET = 'Fertilizer Name'
COLUMNS = train.columns.tolist()

# Split the predictors by numeric-ness instead of comparing against the literal
# 'object' dtype: string-typed columns (pandas "string"/"str" dtypes) are not
# 'object' and would otherwise be filed under the quantitative columns.
QUAN_COLUMNS = [col for col in COLUMNS if col != TARGET and pd.api.types.is_numeric_dtype(train[col])]
CAT_COLUMNS = [col for col in COLUMNS if col != TARGET and not pd.api.types.is_numeric_dtype(train[col])]

print("Total Columns:" ,COLUMNS)
print('Target column:', TARGET)
print('Quantitative columns:', QUAN_COLUMNS)
print('Categorical columns:', CAT_COLUMNS)

Total Columns: ['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name']
Target column: Fertilizer Name
Quantitative columns: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
Categorical columns: ['Soil Type', 'Crop Type']


### Methods

In [None]:
def FE_for_gbdt(train, test):
    """Engineer features for train/test and label-encode the target.

    Steps (applied identically to both frames):
      1. Categorical: add a string-valued temperature bin, then every pairwise
         combination of the categorical columns.
      2. Quantitative: add total nutrients, smoothed ratio features, then every
         pairwise product of the quantitative columns (skipping a ratio times
         its own denominator).
      3. One-hot encode all categorical columns and align the test columns to
         the training feature columns.
      4. Label-encode the target column of ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing the predictors and the ``TARGET`` column.
    test : pandas.DataFrame
        Test data containing the predictors only.

    Returns
    -------
    train : pandas.DataFrame
        Engineered training frame with an integer-encoded ``TARGET`` column.
    test : pandas.DataFrame
        Engineered test frame whose columns are exactly the training feature
        columns (same order, without ``TARGET``).
    le_target : LabelEncoder
        Encoder fitted on the training target, for mapping predictions back.

    Side effects
    ------------
    Extends the module-level ``CAT_COLUMNS`` and ``QUAN_COLUMNS`` lists with the
    names of the newly created categorical / quantitative columns. The input
    frames themselves are not modified.
    """
    # Work on copies: the caller's frames stay untouched, and every feature is
    # added to the same objects that are later encoded and returned.
    train = train.copy()
    test = test.copy()
    frames = (train, test)

    # ------------------------------------------------------------------
    # 1. Categorical features.
    # ------------------------------------------------------------------
    # Ordered lists (not sets) keep column order and names reproducible.
    new_cat_cols = []

    # Add new categorical columns here and register them in new_cat_cols so
    # they take part in the interaction step below.
    # The dataset spells the column "Temparature"; use that exact name.
    for df in frames:
        df["BIN_Temparature"] = df["Temparature"].astype(int).astype(str)
    new_cat_cols.append("BIN_Temparature")

    # 2-level interactions between all categorical columns (built automatically).
    cat_pairs = list(combinations(CAT_COLUMNS + new_cat_cols, 2))
    for c1, c2 in cat_pairs:
        new_col = f"{c1}_{c2}"
        for df in frames:
            df[new_col] = df[c1].astype(str) + "_" + df[c2].astype(str)
        new_cat_cols.append(new_col)

    # ------------------------------------------------------------------
    # 2. Quantitative features.
    # ------------------------------------------------------------------
    new_quan_cols = []
    # Maps each ratio column to its denominator so the interaction step can
    # skip ratio * denominator, which would nearly reproduce the numerator.
    ratio_denominator = {}

    # Add new quantitative columns here and register them in new_quan_cols so
    # they take part in the interaction step below.
    for df in frames:
        df["Total_Nutrients"] = df["Nitrogen"] + df["Phosphorous"] + df["Potassium"]
    new_quan_cols.append("Total_Nutrients")

    # Nutrient ratios plus the humidity-to-moisture ratio.
    ratio_pairs = list(combinations(["Nitrogen", "Phosphorous", "Potassium"], 2))
    ratio_pairs.append(("Humidity", "Moisture"))
    for numerator, denominator in ratio_pairs:
        new_col = f"{numerator}/{denominator}"
        # Smoothing factor: avoids division by zero and damps extreme values.
        # It is taken from the training data only, so train and test rows with
        # identical inputs receive identical feature values.
        k = train[denominator].mean()
        for df in frames:
            df[new_col] = df[numerator] / (df[denominator] + k)
        new_quan_cols.append(new_col)
        ratio_denominator[new_col] = denominator

    # 2-level interactions between all quantitative columns (built automatically).
    quan_pairs = list(combinations(QUAN_COLUMNS + new_quan_cols, 2))
    for c1, c2 in quan_pairs:
        # Skip only the redundant "ratio * its own denominator" products; every
        # other pair, including those involving a ratio, gets an interaction.
        if ratio_denominator.get(c1) == c2 or ratio_denominator.get(c2) == c1:
            continue
        new_col = f"{c1}*{c2}"
        for df in frames:
            df[new_col] = df[c1] * df[c2]
        new_quan_cols.append(new_col)

    # ------------------------------------------------------------------
    # 3. One-hot encode the categorical columns.
    # ------------------------------------------------------------------
    cat_cols = CAT_COLUMNS + new_cat_cols
    train = pd.get_dummies(train, columns = cat_cols, dtype = int)
    test = pd.get_dummies(test, columns = cat_cols, dtype = int)

    # The two frames are encoded independently, so a category (or category
    # combination) present in only one of them yields mismatched columns.
    # Align test to the training feature columns; absent dummies become 0.
    feature_cols = [col for col in train.columns if col != TARGET]
    test = test.reindex(columns = feature_cols, fill_value = 0)

    # ------------------------------------------------------------------
    # 4. Target label encoding.
    # ------------------------------------------------------------------
    le_target = LabelEncoder()
    train[TARGET] = le_target.fit_transform(train[TARGET])

    # Publish the new column names only after everything succeeded, so a
    # failure above does not leave the module-level lists half-updated.
    CAT_COLUMNS.extend(new_cat_cols)
    QUAN_COLUMNS.extend(new_quan_cols)

    # Drop the references to the pre-encoding copies and reclaim their memory.
    del frames
    gc.collect()

    return train, test, le_target

In [None]:
def prob_to_top_k_label(prob, k = 3):
    """Return, for each row of ``prob``, the column indices of the k highest scores, best first."""
    ascending = np.argsort(prob, axis = 1)   # lowest score first within each row
    top_k = ascending[:, -k:]                # keep the k highest-scoring columns
    return np.flip(top_k, axis = 1)          # reverse so the best class comes first

In [None]:
def MAP3_score(y_true, y_pred, k = 3):
    """Mean Average Precision at k for single-label targets.

    Each sample scores ``1 / rank`` when its true label appears at position
    ``rank`` (1-based) among its first ``k`` predictions, and 0 otherwise; the
    result is the mean over all samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels (NumPy array, pandas Series or list).
    y_pred : array-like of shape (n_samples, n_ranked)
        Ranked predictions per sample, best first. Only the first ``k``
        columns are scored.
    k : int, default 3
        Number of ranked predictions taken into account.

    Returns
    -------
    float
        The MAP@k score.
    """
    # np.asarray lets pandas Series and plain lists through (Series has no .reshape).
    true_column = np.asarray(y_true).reshape(-1, 1)
    ranked = np.asarray(y_pred)[:, :k]

    # Reciprocal-rank weights 1, 1/2, 1/3, ... as required by MAP@k.
    # (An evenly spaced 1, 2/3, 1/3 over-credits second-place hits.)
    weight = 1.0 / np.arange(1, ranked.shape[1] + 1)

    hits = (true_column == ranked)
    return np.mean(np.sum(hits * weight, axis = 1))

In [None]:
def make_sub(top_k_preds, le_target):
    """Write the submission file from top-k encoded predictions.

    Parameters
    ----------
    top_k_preds : array-like of shape (n_samples, k)
        Encoded class labels per sample, best first.
    le_target : LabelEncoder
        Fitted encoder used to map the encoded labels back to the original
        target names.

    Returns
    -------
    pandas.DataFrame
        The sample submission frame with the ``TARGET`` column filled with the
        space-separated predicted names; the same frame is also written to
        'submission/logistic_sub.csv'.
    """
    import os  # local import: only needed to make sure the output folder exists

    # Load Sample Submission
    sample_submission = pd.read_csv("data/sample_submission.csv")

    # Convert top_k_preds to original target labels.
    # LabelEncoder.inverse_transform accepts 1-D input only, so flatten the
    # (n_samples, k) matrix, decode, and restore the original shape.
    encoded = np.asarray(top_k_preds).astype(int)
    org_y_pred = le_target.inverse_transform(encoded.ravel()).reshape(encoded.shape)
    sample_submission[TARGET] = [' '.join(x) for x in org_y_pred]

    # Save submission file (create the folder on first use).
    os.makedirs('submission', exist_ok = True)
    sample_submission.to_csv('submission/logistic_sub.csv', index = False)

    # Return the sample submission DataFrame
    return sample_submission

# Training Model

In [None]:
# Setting up the data.
train, test, le_target = FE_for_gbdt(train, test)
train_X = train.drop(TARGET, axis=1)
train_y = train[TARGET]

In [None]:
# Preview the training frame returned by FE_for_gbdt.
train.head()

Unnamed: 0,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous,Fertilizer Name,Soil Type_Black,Soil Type_Clayey,Soil Type_Loamy,...,Crop Type_Cotton,Crop Type_Ground Nuts,Crop Type_Maize,Crop Type_Millets,Crop Type_Oil seeds,Crop Type_Paddy,Crop Type_Pulses,Crop Type_Sugarcane,Crop Type_Tobacco,Crop Type_Wheat
0,37,70,36,36,4,5,4,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,27,69,65,30,6,18,4,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,29,63,32,24,12,16,2,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,35,62,54,39,12,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35,58,43,37,2,16,5,0,0,0,...,0,0,0,0,0,1,0,0,0,0
