In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

def train_logistic_regression(data_path='/content/drive/My Drive/preprocessed_data.csv'):
    """Train a logistic regression model for GOOD_REPUTATION and print its metrics.

    The CSV at ``data_path`` is reduced to a fixed set of columns, split into
    train/test partitions, and then passed through scaling, pairwise
    interaction features, univariate feature selection and SMOTE
    oversampling before fitting the classifier.

    Every preprocessing step is fitted on the training partition only and
    merely applied to the test partition, so no information from the
    held-out rows leaks into the model or into the feature selection.

    Args:
        data_path: Location of the preprocessed CSV file. It must contain
            the columns listed in ``used_features`` below.

    Returns:
        None. Accuracy, the confusion matrix and the F1 score on the test
        partition are printed to stdout.
    """
    print("Training with logistic regression")
    data_df = pd.read_csv(data_path)
    used_features = ["AMT_INCOME_TOTAL", "NAME_EDUCATION_TYPE", "FLAG_OWN_REALTY", "DAYS_EMPLOYED", "CNT_CHILDREN",
                     "NAME_HOUSING_TYPE", "FLAG_OWN_CAR", "DAYS_BIRTH", "GOOD_REPUTATION"]
    data_df = data_df[used_features]

    # Split data into features and target variable
    X = data_df.drop('GOOD_REPUTATION', axis=1)
    y = data_df['GOOD_REPUTATION']

    # Split FIRST, so that every transformer below is fitted without ever
    # seeing the test rows (fitting them on the full data leaks test
    # statistics into training and inflates the reported scores).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Scaling: statistics come from the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Polynomial Features: original columns plus pairwise interaction terms
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_test_poly = poly.transform(X_test_scaled)

    # Feature Selection: scores are computed against the training labels only
    selector = SelectKBest(f_classif, k=8)
    X_train_selected = selector.fit_transform(X_train_poly, y_train)
    X_test_selected = selector.transform(X_test_poly)

    # Apply SMOTE to the training data only; the test set keeps its
    # original class distribution so the metrics reflect real data.
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_selected, y_train)

    # Initialize and train the logistic regression model (L2 penalty, C=0.1).
    # No class_weight is set: class imbalance is handled by SMOTE above.
    model = LogisticRegression(max_iter=300, solver='liblinear', penalty='l2', C=0.1)
    model.fit(X_train_smote, y_train_smote)

    # Make predictions
    predictions = model.predict(X_test_selected)

    # Evaluate the model
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    print("Accuracy:", accuracy)
    print("Confusion Matrix:")
    print(conf_matrix)
    print(f"f1: {f1}")

# Run the training/evaluation routine only when this file is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    train_logistic_regression()


Training with logistic regression
Accuracy: 0.5670597915523862
Confusion Matrix:
[[ 440  432]
 [2725 3695]]
f1: 0.7006731772067887
