In [93]:
#import libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
import arabic_reshaper
from bidi import algorithm 

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report




## Read Dataset

In [None]:
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Load the workbook; sheet index 0 (the first sheet) is what pandas reads by default.
data1 = pd.read_excel('final-data.xlsx', sheet_name=0)
# Preview the first five rows.
data1.head(5)

In [None]:
# Quick visual overview: pandas' built-in histogram grid for the loaded frame.
data1.hist()

In [None]:
# Per-column tally of missing entries (isna is the same check as isnull).
null_counts = data1.isna().sum()

# Show the tally in the cell output.
print(null_counts)

# Wrap the tally in a one-column frame whose column is named 'Null_Count'.
null_counts_df = pd.DataFrame({'Null_Count': null_counts})

# Persist the tally next to the notebook.
null_counts_df.to_excel('null_counts.xlsx')

In [None]:
# Display pandas' descriptive-statistics summary of the loaded frame.
data1.describe()

In [None]:
# Display the dtype of every column; the next cell splits columns by dtype.
data1.dtypes

In [None]:
# Columns stored with the generic object dtype are treated as categorical.
categorical_cols = data1.select_dtypes(include='object').columns
print("Categorical columns:", categorical_cols)

# Columns stored as 64-bit integers or floats are treated as numerical.
numerical_cols = data1.select_dtypes(include=['int64', 'float64']).columns
print("numerical columns:", numerical_cols)

## Value distributions

In [None]:
# Plot the distribution of every column, one figure per column.
#
# The reshaped (display-ready) column labels are applied to a COPY of the data.
# Assigning them to an alias of `data1` would rename `data1`'s own columns and
# silently invalidate `categorical_cols` / `numerical_cols` and every later
# lookup by the original column names.
df = data1.copy()
df.columns = [arabic_reshaper.reshape(column) for column in data1.columns]

for col in df.columns:
    fig = plt.figure(figsize=(8, 6))

    if df[col].dtype == 'object':
        # Categorical column: share of each distinct value as a pie chart.
        df[col].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightgreen', 'lightcoral'])
    else:
        # Numerical column: histogram with 10 bins.
        sns.histplot(df[col], color='skyblue', bins=10)

    # Same title for both chart kinds.
    plt.title(f'توزیع {arabic_reshaper.reshape(col)}')
    plt.tight_layout()
    plt.show()
    # Release the figure so a wide frame does not accumulate open figures.
    plt.close(fig)

## Null values

In [None]:
# Impute missing values column by column with the column's most frequent value
# (the mode). Both the categorical and the numerical columns use the mode.
for col in list(categorical_cols) + list(numerical_cols):
    modes = data1[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to impute with.
        continue
    mode_value = modes.iloc[0]
    # Assign the filled column back instead of `fillna(..., inplace=True)` on a
    # column selection: the in-place form is deprecated chained assignment and
    # does not modify `data1` when pandas Copy-on-Write is enabled.
    data1[col] = data1[col].fillna(mode_value)

# Recompute the counts after imputation; `null_counts` still holds the
# pre-imputation numbers and is left untouched.
remaining_nulls = data1.isnull().sum()
print(remaining_nulls)

## Class Distribution

In [None]:
import matplotlib.pyplot as plt
# Frequency of each class in the target column 'خروج'.
class_counts = data1['خروج'].value_counts()

# Pie chart of the class shares: wedge sizes are the counts, wedge labels are
# the class values, percentages are printed with one decimal.
wedge_sizes = class_counts.values
wedge_labels = class_counts.index
plt.pie(wedge_sizes, labels=wedge_labels, autopct='%1.1f%%')
plt.title('Class Distribution')
plt.show()


## Value description (feature correlation)

In [None]:
# Correlation heatmap of the features together with the target.
#
# The frame is built from `data1` directly: `features` and `target` are only
# created further down the notebook, so referring to them here fails on a
# fresh kernel (Restart & Run All).
target_col = 'خروج'
data = pd.concat([data1.drop(columns=target_col), data1[[target_col]]], axis=1)

# Make the Persian column labels render correctly in matplotlib:
# join the letters (reshape), then put them in visual right-to-left order.
data.columns = [
    algorithm.get_display(arabic_reshaper.reshape(column))
    for column in data.columns
]

# Correlation is only defined for numeric columns; text columns that have not
# been label-encoded yet are left out instead of raising inside corr().
corr = data.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 10))
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.1f')
plt.show()


## Encode categorical variables

In [None]:
# Replace every categorical column by integer codes. A single LabelEncoder is
# refitted per column, so after the loop it only remembers the classes of the
# last column it saw.
encoder = LabelEncoder()
for column_name in categorical_cols:
    encoded_values = encoder.fit_transform(data1[column_name])
    data1[column_name] = encoded_values

## Train & Test Split - Scale

In [None]:
# The label to predict is the 'خروج' column; every other column is a feature.
target = data1['خروج']
features = data1.drop(columns=['خروج'])
# Keep the feature names at hand.
df_columns = features.columns

## Feature importance

In [None]:
#from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Feature importance from a single decision tree.
# The seed is fixed because an unseeded tree breaks ties randomly, which makes
# the importance table differ from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Fit the tree on the full feature matrix and target.
clf.fit(features, target)

# Impurity-based importance of each feature.
clf_importances = clf.feature_importances_

# Pair every feature name with its importance and order from most to least important.
clf_feature_importances = pd.DataFrame({'Feature': features.columns, 'Importance': clf_importances})
clf_feature_importances = clf_feature_importances.sort_values(by='Importance', ascending=False)

# Show the table.
print(clf_feature_importances)

# Save the table to an Excel file.
clf_feature_importances.to_excel('clf_feature_importances.xlsx')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Feature importance from a random forest.
# The seed is fixed so the bootstrap samples, and therefore the importance
# table, are reproducible.
# NOTE(review): this is a *regressor* fitted on the class label; the other
# importance cells use classifiers - confirm the regressor is intentional.
rf_model = RandomForestRegressor(random_state=42)

# Fit the forest on the full feature matrix and target.
rf_model.fit(features, target)

# Impurity-based importance of each feature.
rf_importances = rf_model.feature_importances_

# Pair every feature name with its importance and order from most to least important.
rf_feature_importances = pd.DataFrame({'Feature': features.columns, 'Importance': rf_importances})
rf_feature_importances = rf_feature_importances.sort_values(by='Importance', ascending=False)

# Show the table.
print(rf_feature_importances)

# Save the table to an Excel file.
rf_feature_importances.to_excel('rf_feature_importances.xlsx')


In [None]:
# Recursive feature elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier


# Random forest used by RFE to score the features; seeded so the selection is
# reproducible.
rfc_model = RandomForestClassifier(random_state=42)

# Drop one feature per round (step=1) until 11 features remain.
rfe_selector = RFE(estimator=rfc_model, n_features_to_select=11, step=1)

# Fit the RFE selector to the data
rfe_selector.fit(features, target)

# ranking_ holds one rank per ORIGINAL feature, in column order
# (rank 1 = selected).
feature_rankings = rfe_selector.ranking_

# support_ is the boolean mask of the selected features.
selected_features = features.columns[rfe_selector.support_]

# Apply the same mask to the rankings so each selected feature is printed with
# its own rank. Zipping the selected names against the unmasked ranking array
# would pair them with the ranks of the first columns instead.
selected_rankings = feature_rankings[rfe_selector.support_]
for feature, ranking in zip(selected_features, selected_rankings):
    print(f"{feature}: {ranking}")


In [None]:
# Extra trees
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomized trees; seeded so the importance table is reproducible.
et_model = ExtraTreesClassifier(random_state=42)

# Fit the model to the data
et_model.fit(features, target)

# Raw importance array, kept under its own name so that
# `et_feature_importances` only ever refers to the final table.
et_importances = et_model.feature_importances_

# Pair every feature name with its importance and order from most to least important.
et_feature_importances = pd.DataFrame({'Feature': features.columns, 'Importance': et_importances})
et_feature_importances = et_feature_importances.sort_values(by='Importance', ascending=False)

print(et_feature_importances)

# Save the results to an Excel file
et_feature_importances.to_excel('et_feature_importances.xlsx')

In [None]:
from scipy.stats import chi2_contingency

# Chi-square statistic of every feature against the target: cross-tabulate the
# feature values with the target classes and keep the first element of the
# test result (the statistic).
chi2_statistics = [
    chi2_contingency(pd.crosstab(features[column], target))[0]
    for column in features.columns
]

# Table of feature names and statistics, largest statistic first.
chi2_feature_importances = pd.DataFrame(
    {'Feature': features.columns, 'Importance': chi2_statistics}
).sort_values(by='Importance', ascending=False)

print(chi2_feature_importances)

# Write the table to an Excel file.
chi2_feature_importances.to_excel('chi2_feature_importances.xlsx')

In [None]:
from sklearn.linear_model import Lasso


# Feature importance from the absolute Lasso coefficients.
#
# The features are standardized first: the L1 penalty and the coefficient
# magnitudes both depend on the units of each column, so on raw data |coef|
# mostly reflects the column's scale rather than its importance.
lasso_inputs = StandardScaler().fit_transform(features)

# alpha is the regularization strength
lasso = Lasso(alpha=0.1)
lasso.fit(lasso_inputs, target)

# Importance score = magnitude of the coefficient; a coefficient shrunk to
# exactly 0 means the Lasso dropped that feature.
lasso_importances = np.abs(lasso.coef_)

# Pair every feature name with its importance and order from most to least important.
l_feature_importances = pd.DataFrame({'Feature': features.columns, 'Importance': lasso_importances})
l_feature_importances = l_feature_importances.sort_values(by='Importance', ascending=False)

print(l_feature_importances)


## SMOTE on Train data

In [None]:
# Hold out 30% of the rows as the test set. Stratifying on the target keeps the
# class proportions equal in both parts; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize every feature column. The scaler learns its statistics from the
# training part only and the same transformation is then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Class counts of both label sets before any oversampling.
print('No of ROWs before SMOTE')
for caption, labels in (('y_train:\n', y_train), ('y_test:\n', y_test)):
    print(caption, labels.value_counts())

In [None]:
# SMOTE: oversample the minority class of the TRAINING data only.
from imblearn.over_sampling import SMOTE
# sampling_strategy is the minority/majority ratio requested after resampling.
# The seed is fixed so the synthetic samples are the same on every run.
# NOTE(review): the k-fold cells further down request 0.062 instead of 0.033 -
# confirm which ratio is intended.
sm = SMOTE(sampling_strategy=0.033, random_state=42)
X_train_s, y_train_s = sm.fit_resample(X_train, y_train)

In [None]:
# Class counts after oversampling: only the training labels were resampled,
# the test labels are printed for comparison under their own caption.
print('No of ROWs after SMOTE')
print('y_train:\n',y_train_s.value_counts())
print('y_test:\n',y_test.value_counts())

In [None]:
# Display the feature matrix (the unscaled frame; scaling above was applied to X_train/X_test only).
features

## k-fold


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report


# Stratified 5-fold cross-validation of a LightGBM classifier.
# Inside every fold SMOTE is fitted on the training part only, so no synthetic
# sample is derived from the rows the fold is evaluated on.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("LGBMClassifier")
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(features, target), 1):
    # kf.split yields POSITIONAL row indices, so rows are selected with .iloc;
    # .loc would only be equivalent while the frame has a default 0..n-1 index.
    X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Seeded so every run generates the same synthetic minority samples.
    sm = SMOTE(sampling_strategy=0.062, random_state=42)
    X_train_s, y_train_s = sm.fit_resample(X_train, y_train)

    # The `rf_clf` name is shared with the other k-fold cells; here it holds a
    # LightGBM model.
    rf_clf = lgb.LGBMClassifier(random_state=42)
    rf_clf.fit(X_train_s, y_train_s)
    y_pred = rf_clf.predict(X_test)

    # Per-class precision / recall / F1 on the untouched part of the fold.
    class_report = classification_report(y_test, y_pred)
    print(f'Classification Report for Fold {fold_idx}:')
    print(class_report)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report


# Stratified 5-fold cross-validation of an XGBoost classifier.
# Inside every fold SMOTE is fitted on the training part only, so no synthetic
# sample is derived from the rows the fold is evaluated on.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("XGBClassifier")
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(features, target), 1):
    # kf.split yields POSITIONAL row indices, so rows are selected with .iloc;
    # .loc would only be equivalent while the frame has a default 0..n-1 index.
    X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Seeded so every run generates the same synthetic minority samples.
    sm = SMOTE(sampling_strategy=0.062, random_state=42)
    X_train_s, y_train_s = sm.fit_resample(X_train, y_train)

    # The `rf_clf` name is shared with the other k-fold cells; here it holds an
    # XGBoost model.
    rf_clf = XGBClassifier(random_state=42)
    rf_clf.fit(X_train_s, y_train_s)
    y_pred = rf_clf.predict(X_test)

    # Per-class precision / recall / F1 on the untouched part of the fold.
    class_report = classification_report(y_test, y_pred)
    print(f'Classification Report for Fold {fold_idx}:')
    print(class_report)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report


# Stratified 5-fold cross-validation of a random forest classifier.
# Inside every fold SMOTE is fitted on the training part only, so no synthetic
# sample is derived from the rows the fold is evaluated on.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("RandomForestClassifier")

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(features, target), 1):
    # kf.split yields POSITIONAL row indices, so rows are selected with .iloc;
    # .loc would only be equivalent while the frame has a default 0..n-1 index.
    X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Seeded so every run generates the same synthetic minority samples.
    sm = SMOTE(sampling_strategy=0.062, random_state=42)
    X_train_s, y_train_s = sm.fit_resample(X_train, y_train)

    rf_clf = RandomForestClassifier(random_state=42)
    rf_clf.fit(X_train_s, y_train_s)
    y_pred = rf_clf.predict(X_test)

    # Per-class precision / recall / F1 on the untouched part of the fold.
    class_report = classification_report(y_test, y_pred)
    print(f'Classification Report for Fold {fold_idx}:')
    print(class_report)
