In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.utils import all_estimators



In [2]:
#set this to the root directory of the project
path_root_dir="./"
data = pd.read_csv(path_root_dir+"/data-science-Optimal-EV-station-placement/data/processed/all_city_data_with_pop.csv")

In [3]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,geometry,parking,edges,EV_stations,parking_space,civic,restaurant,park,...,cinema,library,commercial,retail,townhall,government,residential,city,population,Berlin_data_onlycenter_
0,0,0,"POLYGON ((8.4727605 50.099822499999995, 8.4730...",0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,Frankfurt,9.014051,
1,1,1,"POLYGON ((8.4775730092433 50.10302720327834, 8...",0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,Frankfurt,0.0,
2,2,2,"POLYGON ((8.479750879173663 50.09863320231676,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,Frankfurt,9.014051,
3,3,3,"POLYGON ((8.479688060978736 50.10443297769501,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,Frankfurt,9.014051,
4,4,4,"POLYGON ((8.47965547981383 50.107440331063444,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,Frankfurt,0.0,


In [4]:
#filtering out columsn to be used for modeling
data = data[['geometry','city','EV_stations', 'parking', 'edges',
        'parking_space', 'civic', 'restaurant', 'park', 'school',
       'node', 'Community_centre', 'place_of_worship', 'university', 'cinema',
       'library', 'commercial', 'retail', 'townhall', 'government',
       'residential', 'population']]
print("data size:" , data.shape)
data = data.dropna()
print("data size after dropping na:" , data.shape)
data.head()

data size: (10824, 22)
data size after dropping na: (10129, 22)


Unnamed: 0,geometry,city,EV_stations,parking,edges,parking_space,civic,restaurant,park,school,...,place_of_worship,university,cinema,library,commercial,retail,townhall,government,residential,population
0,"POLYGON ((8.4727605 50.099822499999995, 8.4730...",Frankfurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,9.014051
1,"POLYGON ((8.4775730092433 50.10302720327834, 8...",Frankfurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0.0
2,"POLYGON ((8.479750879173663 50.09863320231676,...",Frankfurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,9.014051
3,"POLYGON ((8.479688060978736 50.10443297769501,...",Frankfurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,9.014051
4,"POLYGON ((8.47965547981383 50.107440331063444,...",Frankfurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0.0


In [5]:
def data_splitter(data, train_cities=None, test_cities=None, test_size=0.2, random_state=42):

    if train_cities is not None:
        train = data[data['city'].isin(train_cities)]
        test = data[data['city'].isin(test_cities)]


        X_train = train.drop(['city','geometry', 'EV_stations'], axis=1)
        y_train = train['EV_stations'].astype(int)
        y_train = y_train.apply(lambda x: 1 if x > 0 else 0)

        X_test = test.drop(['city','geometry', 'EV_stations'], axis=1)
        y_test = test['EV_stations'].astype(int)
        y_test = y_test.apply(lambda x: 1 if x > 0 else 0)
    else:
        X = data.drop(['city','geometry', "EV_stations"], axis=1)
        y = data['EV_stations']
        y = y.apply(lambda x: 1 if x > 0 else 0)
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = data_splitter(data)

In [7]:

# logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Logistic Regression Test Accuracy: ", logreg.score(X_test, y_test))
# classification report
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

Logistic Regression Test Accuracy:  0.8978282329713722
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1786
           1       0.64      0.31      0.42       240

    accuracy                           0.90      2026
   macro avg       0.78      0.64      0.68      2026
weighted avg       0.88      0.90      0.88      2026



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Get all classification model classes
classifiers = all_estimators(type_filter='classifier')

# Initialize result table
results = []
models = {}
# Run models and collect results
for name, ClassifierClass in tqdm(classifiers):
    try:
        # Initialize model
        model = ClassifierClass()
        model.fit(X_train, y_train)
        models[name] = model
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        auc = roc_auc_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        
        # Append results
        results.append([name, accuracy, precision, recall, f1, auc, balanced_accuracy])
    except Exception as e:
        print(f"Error occurred for {name}: {str(e)}")

# Create a DataFrame from results
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score", "AUC", "Balanced Accuracy"])
results_df = results_df.sort_values(by=['F1-score', 'AUC'], ascending=False)
print(results_df)


  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 7 is out of bounds for axis 1 with size 7
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([   0,    1,    2, ..., 2022, 2023, 2025]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
Error occurred for StackingClassifier: StackingClassifier.__init__() missing 1 required positional argument: 'estimators'
Error occurred for VotingClassifier: VotingClassifier.__init__() missing 1 required pos

In [16]:
def run_experiment(X_train, X_test, y_train, y_test):
    # Get all classification model classes
    classifiers = all_estimators(type_filter='classifier')

    # Initialize result table
    results = []
    models = {}
    # Run models and collect results
    for name, ClassifierClass in tqdm(classifiers):
        try:
            # Initialize model
            model = ClassifierClass()
            model.fit(X_train, y_train)
            models[name] = model
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')
            auc = roc_auc_score(y_test, y_pred)
            balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
            
            # Append results
            results.append([name, accuracy, precision, recall, f1, auc, balanced_accuracy])
        except Exception as e:
            print(f"Error occurred for {name}: {str(e)}")

    # Create a DataFrame from results
    results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score", "AUC", "Balanced Accuracy"])
    results_df = results_df.sort_values(by=['F1-score', 'AUC'], ascending=False)
    return results_df, models


In [17]:
result_df, models = run_experiment(X_train, X_test, y_train, y_test)

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 47 is out of bounds for axis 1 with size 33
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([  0,   2,   3,   8,  11,  12,  14,  16,  17,  19,  21,  22,  24,
        26,  27,  28,  29,  30,  33,  35,  36,  37,  38,  39,  40,  43,
        44,  45,  46,  47,  48,  49,  50,  54,  56,  57,  58,  59,  60,
        61,  62,  63,  64,  65,  66,  67,  68,  72,  73,  74,  75,  77,
        78,  79,  80,  81,  82,  83,  84,  86,  87,  89,  90,  91,  92,
        9

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,AUC,Balanced Accuracy
5,DecisionTreeClassifier,0.918699,0.682059,0.652829,0.665761,0.652829,0.652829
2,BernoulliNB,0.847561,0.633567,0.81241,0.665443,0.81241,0.81241
1,BaggingClassifier,0.922764,0.694304,0.641826,0.662771,0.641826,0.641826
18,LogisticRegression,0.914634,0.658303,0.624258,0.638514,0.624258,0.624258
21,MultinomialNB,0.894309,0.626664,0.652892,0.637969,0.652892,0.652892
23,PassiveAggressiveClassifier,0.920732,0.676886,0.614348,0.636859,0.614348,0.614348
26,RandomForestClassifier,0.934959,0.789076,0.595624,0.635116,0.595624,0.595624
7,ExtraTreeClassifier,0.920732,0.671398,0.601157,0.624464,0.601157,0.601157
4,ComplementNB,0.880081,0.605113,0.645233,0.620127,0.645233,0.645233
0,AdaBoostClassifier,0.922764,0.676409,0.589059,0.614229,0.589059,0.589059


In [19]:
results_df.to_csv("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/all_cities_random_shuffle.csv", index=False)

In [11]:
"""
Berlin, Munich, Stuttgart, Frankfurt: Big CITY EXP-1
Kalsruhe, trier, saarbrucken, mainz: EXP-2
"""

# EXP-1
big_cities = ['Berlin', 'Munich', 'Stuttgart', 'Frankfurt']
small_cities = ['Karlsruhe', 'Trier', 'Saarbrücken', 'Mainz']


# make a table in the end to summarise the results of all experiments

# big cities splited in trian and test where only one big city is test and all possible combinations for this
for city in tqdm(big_cities):
    test_cities = [city]
    train_cities = [x for x in big_cities if x != city]
    X_train, X_test, y_train, y_test = data_splitter(data, train_cities=train_cities, test_cities=test_cities)
    results_df, models = run_experiment(X_train, X_test, y_train, y_test)
    results_df.to_csv(f"/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_{city}_.csv", index=False)
    



# small cities splited in trian and test where only one small city is test and all possible combinations for this
for city in tqdm(small_cities):
    test_cities = [city]
    train_cities = [x for x in small_cities if x != city]
    X_train, X_test, y_train, y_test = data_splitter(data, train_cities=train_cities, test_cities=test_cities)
    results_df, models = run_experiment(X_train, X_test, y_train, y_test)
    results_df.to_csv(f"/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_{city}_.csv", index=False)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 51 is out of bounds for axis 1 with size 51
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([   2,    9,   17, ..., 3877, 3878, 3879]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
Error occurred for StackingClassifier: StackingClassifier.__init__() missing 1 required positional argument: 'estimators'
Error occurred for VotingClassifier: VotingClassifier.__init__() missing 1 required pos

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 53 is out of bounds for axis 1 with size 48
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([  15,   17,   22, ..., 1409, 1410, 1420]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
Error occurred for StackingClassifier: StackingClassifier.__init__() missing 1 required positional argument: 'estimators'
Error occurred for VotingClassifier: VotingClassifier.__init__() missing 1 required pos

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 13 is out of bounds for axis 1 with size 10
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  _warn_prf(average, modifier, msg_start, len(result))
  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://s

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([ 11,  20,  21,  23,  27,  30,  31,  32,  39,  40,  41,  42,  43,
        44,  45,  46,  49,  50,  51,  52,  54,  55,  56,  57,  58,  59,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  77,
        79,  80,  81,  82,  84,  85,  86,  95,  98,  99, 100, 101, 102,
       103, 104, 106, 107, 108, 109, 110, 111, 112, 119, 122, 123, 124,
       12

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 12 is out of bounds for axis 1 with size 11
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([   6,   12,   13,   19,   20,   21,   27,   28,   29,   30,   31,
         32,   33,   41,   43,   44,   45,   46,   55,   56,   57,   58,
         59,   60,   61,   66,   67,   68,   69,   70,   71,   79,   80,
         81,   82,   83,   84,   86,   88,   89,   90,   91,   92,   93,
         94,   95,   96,   97,  101,  102,  103,  104,  105,  106,  107,
    

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

Error occurred for AdaBoostClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by AdaBoostClassifier.
Error occurred for BaggingClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BaggingClassifier.
Error occurred for BernoulliNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BernoulliNB.


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Error occurred for CalibratedClassifierCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LinearSVC.
Error occurred for CategoricalNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by CategoricalNB.
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ComplementNB.
Error occurred for DecisionTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by DecisionTreeClassifier.
Error occurred for DummyClassifier: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.
Error occurred for ExtraTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ExtraTreeClassifier.
Error occurred for ExtraTreesClassifier: Found array with 0 sample(s) (shape=(0, 19)) whil

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Error occurred for LogisticRegression: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LogisticRegression.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Error occurred for LogisticRegressionCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LogisticRegressionCV.
Error occurred for MLPClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MLPClassifier.
Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for MultinomialNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MultinomialNB.
Error occurred for NearestCentroid: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by NearestCentroid.
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifie

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 87 is out of bounds for axis 1 with size 46
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([  7,  11,  15,  21,  25,  27,  28,  29,  39,  40,  41,  42,  43,
        47,  51,  52,  53,  54,  55,  58,  59,  63,  64,  65,  66,  70,
        71,  72,  73,  74,  75,  76,  82,  83,  84,  85,  86,  87,  88,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 112, 115, 116, 120, 121, 122, 123,
       12

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/41 [00:00<?, ?it/s]



Error occurred for AdaBoostClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by AdaBoostClassifier.
Error occurred for BaggingClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BaggingClassifier.
Error occurred for BernoulliNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BernoulliNB.


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Error occurred for CalibratedClassifierCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LinearSVC.
Error occurred for CategoricalNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by CategoricalNB.
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ComplementNB.
Error occurred for DecisionTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by DecisionTreeClassifier.
Error occurred for DummyClassifier: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.
Error occurred for ExtraTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ExtraTreeClassifier.
Error occurred for ExtraTreesClassifier: Found array with 0 sample(s) (shape=(0, 19)) whil

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Error occurred for LogisticRegressionCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LogisticRegressionCV.
Error occurred for MLPClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MLPClassifier.
Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for MultinomialNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MultinomialNB.
Error occurred for NearestCentroid: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by NearestCentroid.
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifie

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 47 is out of bounds for axis 1 with size 33
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  _warn_prf(average, modifier, msg_start, len(result))
  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://s

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([  0,   2,   3,   8,  11,  12,  14,  16,  17,  19,  21,  22,  24,
        26,  27,  28,  29,  30,  33,  35,  36,  37,  38,  39,  40,  43,
        44,  45,  46,  47,  48,  49,  50,  54,  56,  57,  58,  59,  60,
        61,  62,  63,  64,  65,  66,  67,  68,  72,  73,  74,  75,  77,
        78,  79,  80,  81,  82,  83,  84,  86,  87,  89,  90,  91,  92,
        9

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,AUC,Balanced Accuracy
5,DecisionTreeClassifier,0.918699,0.682059,0.652829,0.665761,0.652829,0.652829
2,BernoulliNB,0.847561,0.633567,0.81241,0.665443,0.81241,0.81241
1,BaggingClassifier,0.922764,0.694304,0.641826,0.662771,0.641826,0.641826
18,LogisticRegression,0.914634,0.658303,0.624258,0.638514,0.624258,0.624258
21,MultinomialNB,0.894309,0.626664,0.652892,0.637969,0.652892,0.652892
23,PassiveAggressiveClassifier,0.920732,0.676886,0.614348,0.636859,0.614348,0.614348
26,RandomForestClassifier,0.934959,0.789076,0.595624,0.635116,0.595624,0.595624
7,ExtraTreeClassifier,0.920732,0.671398,0.601157,0.624464,0.601157,0.601157
4,ComplementNB,0.880081,0.605113,0.645233,0.620127,0.645233,0.645233
0,AdaBoostClassifier,0.922764,0.676409,0.589059,0.614229,0.589059,0.589059


In [20]:
import pandas as pd
import glob

# Get a list of all result files from different experiments
result_files = glob.glob("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/*.csv")

# Create a dictionary to store the total AUC and count for each model
auc_sum_per_model = {}
count_per_model = {}

# Iterate over each result file
for file in result_files:
    print(file)
    # Load the results for each experiment
    results = pd.read_csv(file)
    
    # Iterate over each row in the results
    for _, row in results.iterrows():
        model = row['Model']
        auc = row['AUC']
        
        # Update the total AUC and count for the model
        if model in auc_sum_per_model:
            auc_sum_per_model[model] += auc
            count_per_model[model] += 1
        else:
            auc_sum_per_model[model] = auc
            count_per_model[model] = 1

# Calculate the average AUC for each model
average_auc_per_model = {model: auc_sum_per_model[model] / count_per_model[model] for model in auc_sum_per_model}

# Create a DataFrame from the average AUC dictionary
average_auc_df = pd.DataFrame(list(average_auc_per_model.items()), columns=['Model', 'Average AUC'])

# Sort the DataFrame by Average AUC in descending order
sorted_models = average_auc_df.sort_values(by='Average AUC', ascending=False)

# Select the top 5 models
top_5_models = sorted_models.head(5)

# Display the best models
print(top_5_models)


/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/all_cities_random_shuffle.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Mainz_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Trier_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Munich_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Berlin_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Saarbrücken_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Stuttgart_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Karlsruhe_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Frankfurt_.csv
                     Model  Average AUC
1              BernoulliNB     0.784188
13         NearestCentroid     0.767735
8             ComplementNB     0.661513
4            MultinomialNB     0.657249


In [23]:
top_5_models

['BernoulliNB',
 'NearestCentroid',
 'ComplementNB',
 'MultinomialNB',
 'DecisionTreeClassifier']

In [31]:
import pandas as pd
import glob

# Get a list of all result files from different experiments
result_files = glob.glob("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/*.csv")

# Create an empty DataFrame to store the combined results
combined_results = pd.DataFrame()

# Create an empty DataFrame to store the summary
summary_results = pd.DataFrame(columns=['type_city'])

# Iterate over each result file
for type_city in ['big', 'small', 'all']:
    # Reset combined_results for each type_city iteration
    combined_results = pd.DataFrame()

    # Iterate over each result file
    for file in result_files:
        # Load the results for each experiment
        if type_city in file:
            results = pd.read_csv(file)
            
            # Append the results to the combined DataFrame
            combined_results = combined_results.append(results)

    # Calculate the average AUC for each model
    average_auc_per_model = combined_results.groupby('Model')['AUC'].mean()
    
    # Sort the models by average AUC in descending order
    sorted_models = average_auc_per_model.sort_values(ascending=False)
    
    # Filter the results to include only the rows corresponding to the top 5 models
    filtered_results = combined_results[combined_results['Model'].isin(top_5_models)]

    # Calculate the average AUC for each model
    average_auc_by_model = filtered_results.groupby('Model')['AUC'].mean()
    
    # Create a row with type_city and average AUC values for each model
    row = {'type_city': type_city}
    row.update(average_auc_by_model)
    
    # Append the row to the summary_results DataFrame
    summary_results = summary_results.append(row, ignore_index=True)

# Display the summary_results DataFrame
print(summary_results)


  type_city  BernoulliNB  ComplementNB  DecisionTreeClassifier  MultinomialNB  \
0       big     0.764115      0.669090                0.621317       0.653853   
1     small     0.810223      0.654498                0.681870       0.666221   
2       all     0.810952      0.651410                0.672189       0.661778   

   NearestCentroid  
0         0.715620  
1         0.853362  
2         0.837221  


  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  summary_results = summary_results.append(row, ignore_index=True)


In [32]:
summary_results

Unnamed: 0,type_city,BernoulliNB,ComplementNB,DecisionTreeClassifier,MultinomialNB,NearestCentroid
0,big,0.764115,0.66909,0.621317,0.653853,0.71562
1,small,0.810223,0.654498,0.68187,0.666221,0.853362
2,all,0.810952,0.65141,0.672189,0.661778,0.837221


In [40]:
import pandas as pd
import glob

# Get a list of all result files from different experiments
result_files = glob.glob("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/*.csv")

# Create an empty DataFrame to store the combined results
combined_results = pd.DataFrame()

# Create an empty DataFrame to store the summary
summary_results = pd.DataFrame(columns=['type_city', 'AUC', 'Accuracy', 'Precision', 'Recall'])

# Iterate over each result file
for type_city in ['big', 'small', 'all']:
    # Reset combined_results for each type_city iteration
    combined_results = pd.DataFrame()

    # Iterate over each result file
    for file in result_files:
        # Load the results for each experiment
        if type_city in file:
            results = pd.read_csv(file)
            
            # Append the results to the combined DataFrame
            combined_results = combined_results.append(results)

    # Filter the results to include only the rows corresponding to the top 5 models
    filtered_results = combined_results[combined_results['Model'].isin(top_5_models)]

    # Calculate the average values for each metric
    average_metrics_per_model = filtered_results.groupby('Model')['AUC', 'Accuracy', 'Precision', 'Recall'].mean()

    # Calculate the average values for each metric
    average_values = average_metrics_per_model.mean()

    # Create a row with type_city, average values for each metric
    row = {'type_city': type_city}
    for metric in ['AUC', 'Accuracy', 'Precision', 'Recall']:
        row[metric] = average_values[metric]

    # Append the row to the summary_results DataFrame
    summary_results = summary_results.append(row, ignore_index=True)

# Display the summary_results DataFrame
summary_results


  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  average_metrics_per_model = filtered_results.groupby('Model')['AUC', 'Accuracy', 'Precision', 'Recall'].mean()
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  average_metrics_per_model = filtered_results.groupby('Model')['AUC', 'Accuracy', 'Precision', 'Recall'].mean()
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(r

Unnamed: 0,type_city,AUC,Accuracy,Precision,Recall
0,big,0.684799,0.813313,0.6492,0.684799
1,small,0.733235,0.884658,0.617062,0.733235
2,all,0.72671,0.876493,0.62105,0.72671


In [46]:
top_5_models

['BernoulliNB',
 'NearestCentroid',
 'ComplementNB',
 'MultinomialNB',
 'DecisionTreeClassifier']

In [45]:
summary_results.to_csv("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/summary_results.csv", index=False)