In [86]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Analysing feature groups

In [84]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SequentialFeatureSelector

import warnings
warnings.filterwarnings(action='ignore')

In [88]:
df = pd.read_csv('/content/drive/MyDrive/학교/Dissertation/Dissertation_final/Data Preprocessing/all_features_scaled.csv')
df

Unnamed: 0,title,page_id,char_count,word_count,sentence_count,syllable_count,complex_word_count,section_count,subsection_count,paragraph_count,...,num_edits,num_editors,num_registered_editors,num_anonymous_editors,num_occasional_editors,edit_per_day,edit_per_editor,revert_count,discussion_count,quality
0,Mayan languages,182013,0.554382,0.490641,0.350556,0.563203,0.727204,0.313583,1.614782,0.765594,...,0.202138,0.020507,0.065268,-0.021923,0.032017,0.236123,1.059491,0.210672,0.171081,1
1,Mu'awiya I,207068,1.902108,1.915008,1.614977,1.920173,1.860678,0.660305,0.804225,0.622956,...,0.706190,0.543133,0.438547,0.622440,0.507665,0.775288,0.832776,0.779877,0.635288,1
2,The Fountainhead,180464,0.527779,0.520419,0.849882,0.568074,0.666285,0.140222,1.209503,0.123721,...,0.125496,0.281341,0.104355,0.436776,0.319459,0.155290,-0.143283,-0.137176,-0.065684,1
3,Northern pintail,218361,-0.519528,-0.520312,-0.575612,-0.539584,-0.622258,-0.379860,-0.715570,-0.518153,...,-0.428819,-0.416772,-0.294330,-0.517026,-0.407130,-0.422839,-0.232880,-0.401317,-0.218555,1
4,Manhattan Project,19603,3.393837,3.347316,3.580064,3.595369,4.328846,2.567274,2.222700,2.358391,...,1.586678,1.760994,1.658053,1.798311,1.830809,1.527498,0.269954,1.507195,0.520634,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5195,Party of Democratic Kampuchea,265468,-1.022361,-1.048866,-1.042723,-1.020531,-0.919472,-1.420025,-0.918209,-0.993615,...,-0.669082,-0.726511,-0.765325,-0.666285,-0.735635,-0.677037,-0.678444,-0.555709,-0.257705,0
5196,Minawara and Multultu,95240,-1.089358,-1.108422,-1.074937,-1.095216,-1.028389,-1.593386,-0.918209,-1.136253,...,-0.688688,-0.749526,-0.792686,-0.684488,-0.755026,-0.698471,-1.323641,-0.559429,-0.266094,0
5197,Theophylline/ephedra/hydroxyzine,262652,-1.095542,-1.115618,-1.082991,-1.101530,-1.032081,-1.593386,-0.918209,-1.136253,...,-0.684766,-0.744731,-0.786822,-0.680847,-0.751604,-0.693822,-1.131271,-0.557569,-0.266094,0
5198,"Channel Lake, Illinois",111450,-1.004166,-1.005192,-0.986347,-1.010428,-0.971161,-1.246665,-0.715570,-0.993615,...,-0.675142,-0.732265,-0.759462,-0.682668,-0.740197,-0.684439,-0.865996,-0.561290,-0.263298,0


In [89]:
def train_and_evaluate_models(X, y, cv=5, test_size=0.2):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    # Initialize the models
    models = {
        'Logistic Regression': LogisticRegression(),
        'SVM': SVC(),
        'Random Forest': RandomForestClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'XGBoost': XGBClassifier(),
        'MLP': MLPClassifier(),
        'KNN': KNeighborsClassifier()
    }

    # Initialize a list to store the results
    results = []

    # Define custom scoring
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='weighted'),
        'recall': make_scorer(recall_score, average='weighted'),
        'f1': make_scorer(f1_score, average='weighted')
    }

    # StratifiedKFold for cross-validation
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Loop through the models
    for model_name, model in models.items():
        # Perform cross-validation on the training set
        cv_results = {metric: cross_val_score(model, X_train, y_train, cv=skf, scoring=scorer).mean() for metric, scorer in scoring.items()}

        # Train the model on the full training set
        model.fit(X_train, y_train)

        # Predict the test set
        y_pred = model.predict(X_test)

        # Calculate the metrics on the test set
        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred, average='weighted')
        test_recall = recall_score(y_test, y_pred, average='weighted')
        test_f1 = f1_score(y_test, y_pred, average='weighted')

        # Append the results to the list
        results.append({
            'Model': model_name,
            'CV Accuracy': cv_results['accuracy'],
            'CV Precision': cv_results['precision'],
            'CV Recall': cv_results['recall'],
            'CV F1 Score': cv_results['f1'],
            'Test Accuracy': test_accuracy,
            'Test Precision': test_precision,
            'Test Recall': test_recall,
            'Test F1 Score': test_f1
        })

    # Create a dataframe from the results
    results_df = pd.DataFrame(results)

    return results_df

In [90]:
def sequential_backward_feature_selection(X, y, model):

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Backward feature selection
    backward_selector = SequentialFeatureSelector(model, direction='backward')
    backward_selector.fit(X_train, y_train)
    backward_selected_features = X.columns[backward_selector.get_support()]

    print(f"Selected Features: {backward_selected_features}")

    return backward_selected_features

In [None]:
# Results when all features are included
X = df.iloc[:, 2:-1]
y = df['quality']

result1 = train_and_evaluate_models(X, y)
result1

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.884375,0.884825,0.884375,0.884342,0.895192,0.895967,0.895192,0.895141
1,SVM,0.890144,0.890998,0.890144,0.890083,0.899038,0.901069,0.899038,0.898911
2,Random Forest,0.894952,0.89045,0.895192,0.893932,0.904808,0.905469,0.904808,0.904769
3,AdaBoost,0.88774,0.888689,0.88774,0.887671,0.888462,0.888743,0.888462,0.888441
4,XGBoost,0.902644,0.903332,0.902644,0.902601,0.900962,0.901391,0.900962,0.900935
5,MLP,0.898317,0.895615,0.897837,0.898277,0.895192,0.895311,0.895192,0.895184
6,KNN,0.866346,0.870768,0.866346,0.865918,0.890385,0.894965,0.890385,0.890066


#### Including only one feature group

In [None]:
df_length = df.iloc[:, list(range(2, 7)) + [-1]]
df_structure = df.iloc[:, list(range(7, 24)) + [-1]]
df_style = df.iloc[:, list(range(24, 31)) + [-1]]
df_readability = df.iloc[:, list(range(31, 38)) + [-1]]
df_edit = df.iloc[:, 38:]

__length features__

In [None]:
# Results when only length features are included
X = df_length.iloc[:, :-1]
y = df_length['quality']

result_length = train_and_evaluate_models(X, y)
result_length

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.782692,0.786362,0.782692,0.781999,0.798077,0.802375,0.798077,0.797357
1,SVM,0.790144,0.794048,0.790144,0.789441,0.795192,0.798782,0.795192,0.794575
2,Random Forest,0.786058,0.786801,0.7875,0.784997,0.791346,0.792385,0.791346,0.791161
3,AdaBoost,0.788702,0.797552,0.788702,0.787072,0.7875,0.795393,0.7875,0.786071
4,XGBoost,0.77524,0.77819,0.77524,0.774667,0.794231,0.795213,0.794231,0.794059
5,MLP,0.798077,0.800395,0.798077,0.798975,0.811538,0.815463,0.811538,0.81095
6,KNN,0.773317,0.775975,0.773317,0.772772,0.791346,0.792672,0.791346,0.79111


In [None]:
# Backward feature selection when only length features are included
X = df_length.iloc[:, :-1]
y = df_length['quality']

model_mlp = MLPClassifier()
sequential_backward_feature_selection(X, y, model_mlp)

Selected Features: Index(['word_count', 'sentence_count'], dtype='object')


Index(['word_count', 'sentence_count'], dtype='object')

In [None]:
# Results with selected features
X = df_length[['word_count', 'sentence_count']]
y = df_length['quality']

length_selected = train_and_evaluate_models(X, y)
length_selected

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.769712,0.77324,0.769712,0.768956,0.789423,0.793198,0.789423,0.788743
1,SVM,0.787019,0.791453,0.787019,0.786194,0.799038,0.803785,0.799038,0.79825
2,Random Forest,0.75649,0.75646,0.75601,0.757799,0.770192,0.770554,0.770192,0.770116
3,AdaBoost,0.788942,0.801179,0.788942,0.786737,0.783654,0.797196,0.783654,0.781161
4,XGBoost,0.776683,0.778658,0.776683,0.776293,0.772115,0.772479,0.772115,0.772039
5,MLP,0.798077,0.801302,0.798077,0.796272,0.805769,0.811412,0.805769,0.804885
6,KNN,0.766346,0.767716,0.766346,0.766053,0.775962,0.776708,0.775962,0.77581


__structure features__

In [None]:
# Results when only structure features are included
X = df_structure.iloc[:, :-1]
y = df_structure['quality']

result_structure = train_and_evaluate_models(X, y)
result_structure

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.859615,0.860197,0.859615,0.859561,0.867308,0.867357,0.867308,0.867303
1,SVM,0.880048,0.881873,0.880048,0.879903,0.879808,0.88321,0.879808,0.87954
2,Random Forest,0.884856,0.885062,0.886298,0.883118,0.888462,0.888743,0.888462,0.888441
3,AdaBoost,0.876202,0.877359,0.876202,0.876104,0.881731,0.883092,0.881731,0.881626
4,XGBoost,0.886538,0.887396,0.886538,0.88647,0.873077,0.873276,0.873077,0.87306
5,MLP,0.886298,0.887142,0.888702,0.885506,0.870192,0.871347,0.870192,0.870091
6,KNN,0.861538,0.864531,0.861538,0.861239,0.857692,0.861592,0.857692,0.857308


In [None]:
# Backward feature selection when only structure features are included
X = df_structure.iloc[:, :-1]
y = df_structure['quality']

model_xgb = XGBClassifier()
sequential_backward_feature_selection(X, y, model_xgb)

Selected Features: Index(['mean_paragraph_size', 'abstract_text_ratio', 'citation_per_text',
       'external_link_count', 'internal_link_count', 'link_per_text',
       'image_count', 'infobox'],
      dtype='object')


Index(['mean_paragraph_size', 'abstract_text_ratio', 'citation_per_text',
       'external_link_count', 'internal_link_count', 'link_per_text',
       'image_count', 'infobox'],
      dtype='object')

In [None]:
# Results with selected features
X = df_structure[['mean_paragraph_size', 'abstract_text_ratio', 'citation_per_text',
                  'external_link_count', 'internal_link_count', 'link_per_text',
                  'image_count', 'infobox']]
y = df_structure['quality']

structure_selected = train_and_evaluate_models(X, y)
structure_selected

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.839183,0.839535,0.839183,0.83914,0.854808,0.855282,0.854808,0.854759
1,SVM,0.867548,0.869606,0.867548,0.867361,0.876923,0.879641,0.876923,0.876702
2,Random Forest,0.880529,0.878752,0.88149,0.880211,0.877885,0.878121,0.877885,0.877866
3,AdaBoost,0.873317,0.874389,0.873317,0.873225,0.870192,0.870687,0.870192,0.870149
4,XGBoost,0.875721,0.876673,0.875721,0.875639,0.871154,0.871704,0.871154,0.871106
5,MLP,0.879808,0.882882,0.880769,0.880899,0.883654,0.885022,0.883654,0.88355
6,KNN,0.849519,0.852043,0.849519,0.849249,0.861538,0.864646,0.861538,0.861243


__style features__

In [None]:
# Results when only style features are included
X = df_style.iloc[:, :-1]
y = df_style['quality']

result_style = train_and_evaluate_models(X, y)
result_style

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.692548,0.702533,0.692548,0.688728,0.702885,0.711508,0.702885,0.699825
1,SVM,0.725721,0.737326,0.725721,0.722389,0.749038,0.760928,0.749038,0.746147
2,Random Forest,0.767788,0.768101,0.765625,0.763277,0.767308,0.767628,0.767308,0.767238
3,AdaBoost,0.760337,0.762061,0.760337,0.759951,0.766346,0.767558,0.766346,0.766081
4,XGBoost,0.756731,0.757236,0.756731,0.756614,0.763462,0.763477,0.763462,0.763458
5,MLP,0.766106,0.767767,0.767067,0.767986,0.767308,0.767324,0.767308,0.767304
6,KNN,0.726923,0.728153,0.726923,0.726565,0.744231,0.746157,0.744231,0.743729


In [None]:
# Backward feature selection when only style features are included
X = df_style.iloc[:, :-1]
y = df_style['quality']

model_rf = RandomForestClassifier()
sequential_backward_feature_selection(X, y, model_rf)

Selected Features: Index(['shortest_sentence_size', 'pronoun_start_count',
       'pronoun_start_count_sentence_ratio'],
      dtype='object')


Index(['shortest_sentence_size', 'pronoun_start_count',
       'pronoun_start_count_sentence_ratio'],
      dtype='object')

In [None]:
# Results with selected features
X = df_style[['shortest_sentence_size', 'pronoun_start_count', 'pronoun_start_count_sentence_ratio']]
y = df_style['quality']

style_selected = train_and_evaluate_models(X, y)
style_selected

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.685096,0.693331,0.685096,0.681753,0.685577,0.692852,0.685577,0.682583
1,SVM,0.710817,0.720897,0.710817,0.707485,0.728846,0.74127,0.728846,0.72531
2,Random Forest,0.733413,0.737144,0.731971,0.735254,0.730769,0.7308,0.730769,0.73076
3,AdaBoost,0.751202,0.751701,0.751202,0.75108,0.754808,0.755497,0.754808,0.754642
4,XGBoost,0.747837,0.74796,0.747837,0.747805,0.732692,0.732723,0.732692,0.732683
5,MLP,0.741346,0.744593,0.738702,0.74435,0.756731,0.757326,0.756731,0.75659
6,KNN,0.7375,0.737845,0.7375,0.737406,0.729808,0.730525,0.729808,0.729597


__readability features__

In [None]:
# Results when only readability features are included
X = df_readability.iloc[:, :-1]
y = df_readability['quality']

result_readability = train_and_evaluate_models(X, y)
result_readability

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.748558,0.755418,0.748558,0.746857,0.750962,0.760009,0.750962,0.748776
1,SVM,0.752885,0.774554,0.752885,0.747928,0.761538,0.783261,0.761538,0.756877
2,Random Forest,0.745913,0.748166,0.750721,0.74252,0.723077,0.726729,0.723077,0.721957
3,AdaBoost,0.723077,0.733386,0.723077,0.719989,0.708654,0.719384,0.708654,0.705047
4,XGBoost,0.73726,0.740252,0.73726,0.736451,0.710577,0.713515,0.710577,0.709578
5,MLP,0.780288,0.783103,0.77476,0.778419,0.779808,0.785752,0.779808,0.778657
6,KNN,0.710337,0.721742,0.710337,0.706492,0.719231,0.730299,0.719231,0.715816


In [None]:
# Backward feature selection when only readability features are included
X = df_readability.iloc[:, :-1]
y = df_readability['quality']

model_mlp = MLPClassifier()
sequential_backward_feature_selection(X, y, model_mlp)

Selected Features: Index(['FRE', 'FKG', 'GFI'], dtype='object')


Index(['FRE', 'FKG', 'GFI'], dtype='object')

In [None]:
# Results with selected features
X = df_readability[['FRE', 'FKG', 'GFI']]
y = df_readability['quality']

readability_selected = train_and_evaluate_models(X, y)
readability_selected

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.696875,0.698937,0.696875,0.696077,0.699038,0.701459,0.699038,0.698132
1,SVM,0.733894,0.75591,0.733894,0.728061,0.725,0.747253,0.725,0.71867
2,Random Forest,0.692308,0.695043,0.695433,0.689861,0.674038,0.67407,0.674038,0.674024
3,AdaBoost,0.721875,0.739206,0.721875,0.716843,0.7125,0.729364,0.7125,0.707116
4,XGBoost,0.713942,0.718547,0.713942,0.712418,0.690385,0.695,0.690385,0.688542
5,MLP,0.738462,0.74656,0.740385,0.734248,0.722115,0.731186,0.722115,0.719363
6,KNN,0.696635,0.698578,0.696635,0.695857,0.701923,0.703658,0.701923,0.701287


__edit features__

In [None]:
# Results when only edit features are included
X = df_edit.iloc[:, :-1]
y = df_edit['quality']

result_edit = train_and_evaluate_models(X, y)
result_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.797115,0.79723,0.797115,0.797097,0.820192,0.820203,0.820192,0.820191
1,SVM,0.804087,0.81062,0.804087,0.803056,0.821154,0.826741,0.821154,0.820386
2,Random Forest,0.815144,0.823469,0.81875,0.81982,0.838462,0.840476,0.838462,0.838222
3,AdaBoost,0.820192,0.824368,0.820192,0.819612,0.839423,0.845153,0.839423,0.838754
4,XGBoost,0.817788,0.820031,0.817788,0.817477,0.844231,0.84628,0.844231,0.844
5,MLP,0.813221,0.81857,0.815865,0.814024,0.831731,0.836359,0.831731,0.83115
6,KNN,0.779567,0.782573,0.779567,0.778955,0.817308,0.821031,0.817308,0.816776


In [None]:
# Backward feature selection when only edit features are included
X = df_edit.iloc[:, :-1]
y = df_edit['quality']

model_ada = AdaBoostClassifier()
sequential_backward_feature_selection(X, y, model_ada)

Selected Features: Index(['article_age_days', 'num_anonymous_editors', 'edit_per_editor',
       'revert_count', 'discussion_count'],
      dtype='object')


Index(['article_age_days', 'num_anonymous_editors', 'edit_per_editor',
       'revert_count', 'discussion_count'],
      dtype='object')

In [None]:
# Results with selected features
X = df_edit[['article_age_days', 'num_anonymous_editors', 'edit_per_editor', 'revert_count', 'discussion_count']]
y = df_edit['quality']

edit_selected = train_and_evaluate_models(X, y)
edit_selected

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.791106,0.791702,0.791106,0.790998,0.805769,0.806222,0.805769,0.805697
1,SVM,0.801683,0.80723,0.801683,0.800781,0.832692,0.83814,0.832692,0.832016
2,Random Forest,0.813462,0.821893,0.81274,0.816174,0.828846,0.832168,0.828846,0.828417
3,AdaBoost,0.821635,0.825894,0.821635,0.821054,0.840385,0.846667,0.840385,0.839658
4,XGBoost,0.815144,0.817306,0.815144,0.814829,0.829808,0.831141,0.829808,0.829636
5,MLP,0.80649,0.814075,0.809856,0.808525,0.830769,0.835539,0.830769,0.830166
6,KNN,0.779808,0.78241,0.779808,0.779293,0.802885,0.80517,0.802885,0.802515


#### Excluding feature groups

In [None]:
df_ex = df.copy()
df_length_ex = df_ex.drop(df_ex.columns[2:7], axis=1)
df_structure_ex = df_ex.drop(df_ex.columns[7:24], axis=1)
df_style_ex = df_ex.drop(df_ex.columns[24:31], axis=1)
df_readability_ex = df_ex.drop(df_ex.columns[31:38], axis=1)
df_edit_ex = df_ex.drop(df_ex.columns[38:48], axis=1)

In [None]:
# Results when only length features are excluded
X = df_length_ex.iloc[:, 2:-1]
y = df_length_ex['quality']

result_length_ex = train_and_evaluate_models(X, y)
result_length_ex

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.881731,0.882276,0.881731,0.88169,0.892308,0.892778,0.892308,0.892275
1,SVM,0.889183,0.890088,0.889183,0.889118,0.899038,0.901069,0.899038,0.898911
2,Random Forest,0.891346,0.89782,0.894712,0.892006,0.904808,0.905349,0.904808,0.904776
3,AdaBoost,0.88774,0.888789,0.88774,0.887667,0.883654,0.883894,0.883654,0.883636
4,XGBoost,0.902644,0.903565,0.902644,0.902587,0.896154,0.896177,0.896154,0.896152
5,MLP,0.898317,0.896488,0.900481,0.897294,0.883654,0.884851,0.883654,0.883563
6,KNN,0.867308,0.871969,0.867308,0.866872,0.8875,0.891264,0.8875,0.887229


In [None]:
# Results when only structure features are excluded
X = df_structure_ex.iloc[:, 2:-1]
y = df_structure_ex['quality']

result_structure_ex = train_and_evaluate_models(X, y)
result_structure_ex

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.847115,0.847937,0.847115,0.847023,0.861538,0.862187,0.861538,0.861476
1,SVM,0.85625,0.858498,0.85625,0.856031,0.873077,0.87508,0.873077,0.872907
2,Random Forest,0.859615,0.861078,0.857452,0.859006,0.868269,0.869758,0.868269,0.868136
3,AdaBoost,0.853125,0.854215,0.853125,0.853011,0.859615,0.860816,0.859615,0.859498
4,XGBoost,0.858173,0.859258,0.858173,0.858066,0.868269,0.868991,0.868269,0.868205
5,MLP,0.869712,0.865334,0.862981,0.86558,0.879808,0.879978,0.879808,0.879794
6,KNN,0.819231,0.822985,0.819231,0.818694,0.843269,0.846873,0.843269,0.842861


In [None]:
# Results when only style features are excluded
X = df_style_ex.iloc[:, 2:-1]
y = df_style_ex['quality']

result_style_ex = train_and_evaluate_models(X, y)
result_style_ex

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.883654,0.884121,0.883654,0.883619,0.888462,0.889037,0.888462,0.88842
1,SVM,0.890144,0.891036,0.890144,0.890081,0.904808,0.90665,0.904808,0.9047
2,Random Forest,0.89351,0.896009,0.893269,0.895155,0.893269,0.893795,0.893269,0.893234
3,AdaBoost,0.891106,0.891817,0.891106,0.891057,0.889423,0.889944,0.889423,0.889386
4,XGBoost,0.903365,0.904362,0.903365,0.903301,0.894231,0.894704,0.894231,0.894199
5,MLP,0.902644,0.896559,0.897837,0.898773,0.886538,0.886544,0.886538,0.886538
6,KNN,0.864663,0.869478,0.864663,0.864213,0.885577,0.889624,0.885577,0.885279


In [None]:
# Results when only readability features are excluded
X = df_readability_ex.iloc[:, 2:-1]
y = df_readability_ex['quality']

result_readability_ex = train_and_evaluate_models(X, y)
result_readability_ex

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.88125,0.881567,0.88125,0.881227,0.891346,0.891765,0.891346,0.891317
1,SVM,0.889663,0.890832,0.889663,0.889582,0.898077,0.900214,0.898077,0.897941
2,Random Forest,0.899279,0.892405,0.893029,0.893708,0.895192,0.895615,0.895192,0.895164
3,AdaBoost,0.887981,0.889014,0.887981,0.887904,0.886538,0.886905,0.886538,0.886512
4,XGBoost,0.900962,0.90169,0.900962,0.900916,0.897115,0.897646,0.897115,0.897081
5,MLP,0.90024,0.905241,0.898317,0.899242,0.891346,0.891765,0.891346,0.891317
6,KNN,0.857692,0.861548,0.857692,0.857308,0.869231,0.87296,0.869231,0.868903


In [None]:
# Results when only edit features are excluded
X = df_edit_ex.iloc[:, 2:-1]
y = df_edit_ex['quality']

result_edit_ex = train_and_evaluate_models(X, y)
result_edit_ex

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.872356,0.873145,0.872356,0.872291,0.877885,0.878121,0.877885,0.877866
1,SVM,0.881731,0.882779,0.881731,0.881649,0.884615,0.886077,0.884615,0.884506
2,Random Forest,0.88726,0.88896,0.886538,0.887924,0.883654,0.883769,0.883654,0.883645
3,AdaBoost,0.875721,0.876631,0.875721,0.875646,0.883654,0.883769,0.883654,0.883645
4,XGBoost,0.891587,0.89241,0.891587,0.891526,0.894231,0.894604,0.894231,0.894206
5,MLP,0.88726,0.885705,0.885096,0.886477,0.883654,0.884167,0.883654,0.883615
6,KNN,0.860817,0.864936,0.860817,0.860399,0.858654,0.862712,0.858654,0.858257


### Only including features selected overall

In [None]:
# Results with all selected features
X = df[['word_count', 'sentence_count', 'mean_paragraph_size', 'abstract_text_ratio', 'citation_per_text',
               'external_link_count', 'internal_link_count', 'link_per_text', 'image_count', 'infobox',
               'shortest_sentence_size', 'pronoun_start_count', 'pronoun_start_count_sentence_ratio',
               'FRE', 'FKG', 'GFI', 'article_age_days', 'num_anonymous_editors', 'edit_per_editor', 'revert_count', 'discussion_count']]
y = df_length['quality']

all_selected = train_and_evaluate_models(X, y)
all_selected

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.867067,0.86774,0.867067,0.867007,0.885577,0.885692,0.885577,0.885568
1,SVM,0.885337,0.886492,0.885337,0.885249,0.9,0.902148,0.9,0.899866
2,Random Forest,0.889183,0.895608,0.894471,0.894658,0.899038,0.89969,0.899038,0.898997
3,AdaBoost,0.886779,0.887808,0.886779,0.886704,0.890385,0.890853,0.890385,0.890352
4,XGBoost,0.899038,0.900154,0.899038,0.898968,0.905769,0.906154,0.905769,0.905747
5,MLP,0.893029,0.895338,0.894471,0.892981,0.900962,0.902392,0.900962,0.900873
6,KNN,0.859856,0.862615,0.859856,0.85958,0.865385,0.869368,0.865385,0.865021


### Combinations of feature groups

__Cominations of two feature groups__

In [None]:
length_structure = df.iloc[:, list(range(2, 24)) + [-1]]
length_style = df.iloc[:, list(range(2, 7))+list(range(24, 31)) + [-1]]
length_readability = df.iloc[:, list(range(2, 7))+list(range(31, 38)) + [-1]]
length_edit = df.iloc[:, list(range(2, 7))+list(range(38, 48)) + [-1]]
structure_style = df.iloc[:, list(range(7, 24))+list(range(24, 31)) + [-1]]
structure_readability = df.iloc[:, list(range(7, 24))+list(range(31, 38)) + [-1]]
structure_edit = df.iloc[:, list(range(7, 24))+list(range(38, 48)) + [-1]]
style_readability = df.iloc[:, list(range(24, 31))+list(range(31, 38)) + [-1]]
style_edit = df.iloc[:, list(range(24, 31))+list(range(38, 48)) + [-1]]
readability_edit = df.iloc[:, list(range(31, 38))+list(range(38, 48)) + [-1]]

In [None]:
# Results of length+structure
X = length_structure.iloc[:, :-1]
y = length_structure['quality']

result_length_structure = train_and_evaluate_models(X, y)
result_length_structure

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.870192,0.870965,0.870192,0.870126,0.866346,0.866575,0.866346,0.866325
1,SVM,0.88125,0.882637,0.88125,0.881142,0.8875,0.889263,0.8875,0.887372
2,Random Forest,0.888221,0.892573,0.890144,0.887211,0.874038,0.87435,0.874038,0.874012
3,AdaBoost,0.873558,0.87466,0.873558,0.873464,0.881731,0.882922,0.881731,0.881639
4,XGBoost,0.885577,0.886345,0.885577,0.885512,0.883654,0.883973,0.883654,0.88363
5,MLP,0.894712,0.895092,0.890625,0.893232,0.876923,0.87728,0.876923,0.876894
6,KNN,0.8625,0.865856,0.8625,0.86217,0.864423,0.867963,0.864423,0.864096


In [None]:
# Results of length+style
X = length_style.iloc[:, :-1]
y = length_style['quality']

result_length_style = train_and_evaluate_models(X, y)
result_length_style

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.784375,0.787723,0.784375,0.783741,0.8,0.804048,0.8,0.799332
1,SVM,0.814904,0.817719,0.814904,0.814494,0.822115,0.826316,0.822115,0.821541
2,Random Forest,0.820673,0.821517,0.820192,0.819717,0.818269,0.818695,0.818269,0.818209
3,AdaBoost,0.81274,0.815107,0.81274,0.812396,0.819231,0.82221,0.819231,0.818812
4,XGBoost,0.811298,0.812833,0.811298,0.811066,0.824038,0.824308,0.824038,0.824002
5,MLP,0.826202,0.822122,0.825962,0.822153,0.827885,0.82921,0.827885,0.827711
6,KNN,0.796875,0.798905,0.796875,0.796532,0.798077,0.800428,0.798077,0.797681


In [None]:
# Results of length+readability
X = length_readability.iloc[:, :-1]
y = length_readability['quality']

result_length_readability = train_and_evaluate_models(X, y)
result_length_readability

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.806971,0.807323,0.806971,0.80692,0.831731,0.831791,0.831731,0.831723
1,SVM,0.80601,0.812888,0.80601,0.804927,0.818269,0.824668,0.818269,0.817369
2,Random Forest,0.796635,0.802176,0.798798,0.798422,0.813462,0.816155,0.813462,0.813063
3,AdaBoost,0.796635,0.800946,0.796635,0.79591,0.792308,0.795035,0.792308,0.791827
4,XGBoost,0.793029,0.795004,0.793029,0.792668,0.8,0.802163,0.8,0.799641
5,MLP,0.823077,0.825388,0.820673,0.821126,0.832692,0.835797,0.832692,0.832305
6,KNN,0.770433,0.774866,0.770433,0.769537,0.790385,0.793094,0.790385,0.789899


In [None]:
# Results of length+edit
X = length_edit.iloc[:, :-1]
y = length_edit['quality']

result_length_edit = train_and_evaluate_models(X, y)
result_length_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.822596,0.823156,0.822596,0.822518,0.832692,0.832712,0.832692,0.83269
1,SVM,0.830288,0.834,0.830288,0.829815,0.838462,0.841131,0.838462,0.838145
2,Random Forest,0.8375,0.841643,0.837019,0.838467,0.855769,0.857679,0.855769,0.855576
3,AdaBoost,0.832692,0.835797,0.832692,0.832306,0.851923,0.856356,0.851923,0.851461
4,XGBoost,0.840385,0.841509,0.840385,0.84026,0.861538,0.86328,0.861538,0.861372
5,MLP,0.840625,0.841082,0.840144,0.843386,0.845192,0.846125,0.845192,0.845088
6,KNN,0.807933,0.812055,0.807933,0.80729,0.822115,0.828235,0.822115,0.821282


In [None]:
# Results of structure+style
X = structure_style.iloc[:, :-1]
y = structure_style['quality']

result_structure_style = train_and_evaluate_models(X, y)
result_structure_style

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.866106,0.866471,0.866106,0.866074,0.869231,0.869236,0.869231,0.86923
1,SVM,0.878846,0.879969,0.878846,0.878755,0.886538,0.888008,0.886538,0.886431
2,Random Forest,0.884135,0.887552,0.888462,0.885516,0.884615,0.885077,0.884615,0.884581
3,AdaBoost,0.874038,0.875087,0.874038,0.873953,0.884615,0.885077,0.884615,0.884581
4,XGBoost,0.890625,0.891357,0.890625,0.890573,0.882692,0.882834,0.882692,0.882681
5,MLP,0.885096,0.881543,0.886058,0.886016,0.873077,0.873348,0.873077,0.873054
6,KNN,0.863702,0.86679,0.863702,0.863398,0.857692,0.860513,0.857692,0.857413


In [None]:
# Results of structure+readability
X = structure_readability.iloc[:, :-1]
y = structure_readability['quality']

result_structure_readability = train_and_evaluate_models(X, y)
result_structure_readability

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.868269,0.868945,0.868269,0.868209,0.873077,0.873276,0.873077,0.87306
1,SVM,0.88149,0.882839,0.88149,0.881381,0.895192,0.896991,0.895192,0.895073
2,Random Forest,0.889423,0.890757,0.888221,0.886959,0.884615,0.88482,0.884615,0.8846
3,AdaBoost,0.872837,0.873793,0.872837,0.872754,0.873077,0.873099,0.873077,0.873075
4,XGBoost,0.888462,0.889462,0.888462,0.888383,0.875,0.875139,0.875,0.874988
5,MLP,0.886298,0.890896,0.888221,0.891052,0.8875,0.888709,0.8875,0.887412
6,KNN,0.858894,0.862474,0.858894,0.85853,0.872115,0.876022,0.872115,0.871782


In [None]:
# Results of structure+edit
X = structure_edit.iloc[:, :-1]
y = structure_edit['quality']

result_structure_edit = train_and_evaluate_models(X, y)
result_structure_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.874279,0.874525,0.874279,0.87426,0.875,0.87545,0.875,0.874963
1,SVM,0.885817,0.887526,0.885817,0.885693,0.890385,0.893464,0.890385,0.89017
2,Random Forest,0.892788,0.891703,0.891106,0.892734,0.898077,0.899234,0.898077,0.898003
3,AdaBoost,0.885817,0.886939,0.885817,0.885731,0.888462,0.889291,0.888462,0.888402
4,XGBoost,0.899038,0.899795,0.899038,0.898989,0.899038,0.89969,0.899038,0.898997
5,MLP,0.901923,0.901474,0.900481,0.898037,0.882692,0.883379,0.882692,0.88264
6,KNN,0.865625,0.869593,0.865625,0.865258,0.882692,0.886864,0.882692,0.882375


In [None]:
# Results of style_readability
X = style_readability.iloc[:, :-1]
y = style_readability['quality']

result_style_readability = train_and_evaluate_models(X, y)
result_style_readability

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.796875,0.800347,0.796875,0.796269,0.800962,0.805161,0.800962,0.800274
1,SVM,0.794471,0.799747,0.794471,0.793577,0.811538,0.817978,0.811538,0.810579
2,Random Forest,0.792788,0.793384,0.793269,0.7919,0.811538,0.812444,0.811538,0.811402
3,AdaBoost,0.777404,0.781525,0.777404,0.776612,0.791346,0.793544,0.791346,0.790955
4,XGBoost,0.79351,0.795308,0.79351,0.793196,0.799038,0.799971,0.799038,0.798882
5,MLP,0.815865,0.817344,0.81226,0.812244,0.827885,0.831872,0.827885,0.827366
6,KNN,0.750481,0.760037,0.750481,0.748157,0.763462,0.769526,0.763462,0.762123


In [None]:
# Results of style_edit
X = style_edit.iloc[:, :-1]
y = style_edit['quality']

result_style_edit = train_and_evaluate_models(X, y)
result_style_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.817548,0.817935,0.817548,0.817491,0.8375,0.837501,0.8375,0.8375
1,SVM,0.842308,0.844646,0.842308,0.842037,0.854808,0.856815,0.854808,0.854603
2,Random Forest,0.844231,0.845717,0.842308,0.843087,0.865385,0.866953,0.865385,0.865241
3,AdaBoost,0.845913,0.848718,0.845913,0.845603,0.855769,0.857297,0.855769,0.855615
4,XGBoost,0.848558,0.850269,0.848558,0.848371,0.8625,0.863631,0.8625,0.862393
5,MLP,0.849279,0.851424,0.85024,0.847009,0.858654,0.860104,0.858654,0.858511
6,KNN,0.804808,0.806702,0.804808,0.804514,0.842308,0.845502,0.842308,0.841942


In [None]:
# Results of readability_edit
X = readability_edit.iloc[:, :-1]
y = readability_edit['quality']

result_readability_edit = train_and_evaluate_models(X, y)
result_readability_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.819231,0.819717,0.819231,0.819162,0.853846,0.85437,0.853846,0.853792
1,SVM,0.820433,0.825423,0.820433,0.819748,0.842308,0.84662,0.842308,0.841816
2,Random Forest,0.829087,0.829907,0.826202,0.826824,0.851923,0.854018,0.851923,0.851704
3,AdaBoost,0.830288,0.83281,0.830288,0.829979,0.843269,0.846873,0.843269,0.842861
4,XGBoost,0.830048,0.831677,0.830048,0.82984,0.842308,0.843609,0.842308,0.842158
5,MLP,0.835337,0.838689,0.837019,0.833075,0.847115,0.852625,0.847115,0.846516
6,KNN,0.797356,0.801085,0.797356,0.796727,0.8125,0.816575,0.8125,0.811895


__Cominations of three feature groups__

In [None]:
length_structure_style = df.iloc[:, list(range(2, 31)) + [-1]]
length_structure_readability = df.iloc[:, list(range(2, 24))+list(range(31, 38)) + [-1]]
length_structure_edit = df.iloc[:, list(range(2, 24))+list(range(38, 48)) + [-1]]
structure_style_readability = df.iloc[:, list(range(7, 38))+[-1]]
structure_style_edit = df.iloc[:, list(range(7, 31))+list(range(38, 48)) + [-1]]
style_readablity_edit = df.iloc[:, list(range(24, 48)) + [-1]]

In [None]:
# Results of length_structure_style
X = length_structure_style.iloc[:, :-1]
y = length_structure_style['quality']

result_length_structure_style = train_and_evaluate_models(X, y)
result_length_structure_style

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.873317,0.874136,0.873317,0.873251,0.877885,0.87792,0.877885,0.877882
1,SVM,0.879808,0.880985,0.879808,0.879714,0.885577,0.887332,0.885577,0.885447
2,Random Forest,0.884135,0.888876,0.88774,0.884089,0.882692,0.882896,0.882692,0.882677
3,AdaBoost,0.871394,0.872309,0.871394,0.871316,0.882692,0.883259,0.882692,0.882649
4,XGBoost,0.891106,0.891859,0.891106,0.891049,0.888462,0.888513,0.888462,0.888458
5,MLP,0.888221,0.887808,0.892067,0.890835,0.883654,0.884406,0.883654,0.883597
6,KNN,0.861298,0.865015,0.861298,0.86093,0.857692,0.861889,0.857692,0.857278


In [None]:
# Results of length_structure_readability
X = length_structure_readability.iloc[:, :-1]
y = length_structure_readability['quality']

result_length_structure_readability = train_and_evaluate_models(X, y)
result_length_structure_readability

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.873077,0.873958,0.873077,0.873005,0.875962,0.876698,0.875962,0.875901
1,SVM,0.883413,0.884496,0.883413,0.88333,0.892308,0.893799,0.892308,0.892206
2,Random Forest,0.884375,0.889452,0.892067,0.889607,0.883654,0.883769,0.883654,0.883645
3,AdaBoost,0.876442,0.877404,0.876442,0.876363,0.879808,0.879922,0.879808,0.879799
4,XGBoost,0.895433,0.896273,0.895433,0.895373,0.886538,0.887002,0.886538,0.886504
5,MLP,0.89399,0.89013,0.889904,0.890588,0.881731,0.882241,0.881731,0.881691
6,KNN,0.861058,0.865197,0.861058,0.86063,0.871154,0.875829,0.871154,0.870752


In [None]:
# Results of length_structure_edit
X = length_structure_edit.iloc[:, :-1]
y = length_structure_edit['quality']

result_length_structure_edit = train_and_evaluate_models(X, y)
result_length_structure_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.880769,0.8812,0.880769,0.880737,0.886538,0.887232,0.886538,0.886488
1,SVM,0.891587,0.892966,0.891587,0.891493,0.898077,0.900446,0.898077,0.897926
2,Random Forest,0.894231,0.894808,0.896154,0.895629,0.894231,0.894815,0.894231,0.894192
3,AdaBoost,0.891106,0.892168,0.891106,0.891027,0.889423,0.88984,0.889423,0.889394
4,XGBoost,0.897356,0.897885,0.897356,0.897321,0.892308,0.892592,0.892308,0.892288
5,MLP,0.901442,0.903027,0.901923,0.901644,0.890385,0.890755,0.890385,0.890359
6,KNN,0.863942,0.868279,0.863942,0.863535,0.876923,0.883481,0.876923,0.876395


In [None]:
# Results of structure_style_readability
X = structure_style_readability.iloc[:, :-1]
y = structure_style_readability['quality']

result_structure_style_readability = train_and_evaluate_models(X, y)
result_structure_style_readability

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.871635,0.872302,0.871635,0.871579,0.878846,0.879301,0.878846,0.87881
1,SVM,0.881971,0.882781,0.881971,0.881909,0.8875,0.888709,0.8875,0.887412
2,Random Forest,0.888462,0.885541,0.887019,0.889136,0.886538,0.886744,0.886538,0.886523
3,AdaBoost,0.876683,0.877566,0.876683,0.876611,0.881731,0.881902,0.881731,0.881718
4,XGBoost,0.89375,0.89449,0.89375,0.893699,0.892308,0.892517,0.892308,0.892293
5,MLP,0.88726,0.884607,0.882212,0.886268,0.878846,0.878986,0.878846,0.878835
6,KNN,0.859615,0.864184,0.859615,0.859163,0.855769,0.858827,0.855769,0.855461


In [None]:
# Results of structure_style_edit
X = structure_style_edit.iloc[:, :-1]
y = structure_style_edit['quality']

result_structure_style_edit = train_and_evaluate_models(X, y)
result_structure_style_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.877163,0.877456,0.877163,0.877141,0.888462,0.888743,0.888462,0.888441
1,SVM,0.888462,0.889717,0.888462,0.888373,0.894231,0.896819,0.894231,0.894058
2,Random Forest,0.896154,0.897087,0.894712,0.89297,0.897115,0.897446,0.897115,0.897094
3,AdaBoost,0.888702,0.889846,0.888702,0.88862,0.891346,0.891765,0.891346,0.891317
4,XGBoost,0.904808,0.905456,0.904808,0.90477,0.903846,0.904444,0.903846,0.903811
5,MLP,0.894712,0.900179,0.901202,0.894925,0.884615,0.884706,0.884615,0.884609
6,KNN,0.857933,0.861453,0.857933,0.857585,0.871154,0.874081,0.871154,0.870901


In [None]:
# Results of style_readablity_edit
X = style_readablity_edit.iloc[:, :-1]
y = style_readablity_edit['quality']

result_style_readablity_edit = train_and_evaluate_models(X, y)
result_style_readablity_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.845192,0.846472,0.845192,0.845046,0.859615,0.860517,0.859615,0.859528
1,SVM,0.851442,0.85383,0.851442,0.85119,0.869231,0.872143,0.869231,0.868974
2,Random Forest,0.848798,0.852429,0.849279,0.847879,0.856731,0.858003,0.856731,0.856603
3,AdaBoost,0.849519,0.851296,0.849519,0.849328,0.8625,0.86415,0.8625,0.862344
4,XGBoost,0.85649,0.858019,0.85649,0.856336,0.865385,0.866953,0.865385,0.865241
5,MLP,0.860577,0.863306,0.861058,0.862585,0.869231,0.869893,0.869231,0.869172
6,KNN,0.812019,0.817304,0.812019,0.811235,0.836538,0.841079,0.836538,0.835993


__Cominations of four feature groups__

In [None]:
length_structure_style_readability = df.iloc[:, list(range(2, 38)) + [-1]]
length_structure_style_edit = df.iloc[:, list(range(2, 31))+list(range(38, 48)) + [-1]]
length_structure_readability_edit = df.iloc[:, list(range(2, 24))+list(range(31, 48)) + [-1]]
length_style_readability_edit = df.iloc[:, list(range(2, 7))+list(range(24, 48))+[-1]]
structure_style_readability_edit = df.iloc[:, list(range(7, 48))+[-1]]

In [None]:
# Results of length_structure_style_readability
X = length_structure_style_readability.iloc[:, :-1]
y = length_structure_style_readability['quality']

result_length_structure_style_readability = train_and_evaluate_models(X, y)
result_length_structure_style_readability

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.872356,0.873145,0.872356,0.872291,0.877885,0.878121,0.877885,0.877866
1,SVM,0.881731,0.882779,0.881731,0.881649,0.884615,0.886077,0.884615,0.884506
2,Random Forest,0.884135,0.886512,0.884135,0.889392,0.884615,0.885185,0.884615,0.884573
3,AdaBoost,0.875721,0.876631,0.875721,0.875646,0.883654,0.883769,0.883654,0.883645
4,XGBoost,0.891587,0.89241,0.891587,0.891526,0.894231,0.894604,0.894231,0.894206
5,MLP,0.886779,0.886317,0.8875,0.884307,0.874038,0.874439,0.874038,0.874005
6,KNN,0.860817,0.864936,0.860817,0.860399,0.858654,0.862712,0.858654,0.858257


In [None]:
# Results of length_structure_style_edit
X = length_structure_style_edit.iloc[:, :-1]
y = length_structure_style_edit['quality']

result_length_structure_style_edit = train_and_evaluate_models(X, y)
result_length_structure_style_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.88125,0.881567,0.88125,0.881227,0.891346,0.891765,0.891346,0.891317
1,SVM,0.889663,0.890832,0.889663,0.889582,0.898077,0.900214,0.898077,0.897941
2,Random Forest,0.892788,0.893197,0.894471,0.895638,0.899038,0.89969,0.899038,0.898997
3,AdaBoost,0.887981,0.889014,0.887981,0.887904,0.886538,0.886905,0.886538,0.886512
4,XGBoost,0.900962,0.90169,0.900962,0.900916,0.897115,0.897646,0.897115,0.897081
5,MLP,0.897115,0.90194,0.898317,0.898764,0.892308,0.892453,0.892308,0.892298
6,KNN,0.857692,0.861548,0.857692,0.857308,0.869231,0.87296,0.869231,0.868903


In [None]:
# Results of length_structure_readability_edit
X = length_structure_readability_edit.iloc[:, :-1]
y = length_structure_readability_edit['quality']

result_length_structure_readability_edit = train_and_evaluate_models(X, y)
result_length_structure_readability_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.883654,0.884121,0.883654,0.883619,0.888462,0.889037,0.888462,0.88842
1,SVM,0.890144,0.891036,0.890144,0.890081,0.904808,0.90665,0.904808,0.9047
2,Random Forest,0.895913,0.895638,0.896635,0.89419,0.9,0.900593,0.9,0.899963
3,AdaBoost,0.891106,0.891817,0.891106,0.891057,0.889423,0.889944,0.889423,0.889386
4,XGBoost,0.903365,0.904362,0.903365,0.903301,0.894231,0.894704,0.894231,0.894199
5,MLP,0.898317,0.898348,0.899519,0.898526,0.875962,0.876698,0.875962,0.875901
6,KNN,0.864663,0.869478,0.864663,0.864213,0.885577,0.889624,0.885577,0.885279


In [None]:
# Results of length_style_readability_edit
X = length_style_readability_edit.iloc[:, :-1]
y = length_style_readability_edit['quality']

result_length_style_readability_edit = train_and_evaluate_models(X, y)
result_length_style_readability_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.847115,0.847937,0.847115,0.847023,0.861538,0.862187,0.861538,0.861476
1,SVM,0.85625,0.858498,0.85625,0.856031,0.873077,0.87508,0.873077,0.872907
2,Random Forest,0.862981,0.862029,0.862981,0.856411,0.869231,0.871213,0.869231,0.869056
3,AdaBoost,0.853125,0.854215,0.853125,0.853011,0.859615,0.860816,0.859615,0.859498
4,XGBoost,0.858173,0.859258,0.858173,0.858066,0.868269,0.868991,0.868269,0.868205
5,MLP,0.862981,0.870936,0.864423,0.865522,0.878846,0.879796,0.878846,0.87877
6,KNN,0.819231,0.822985,0.819231,0.818694,0.843269,0.846873,0.843269,0.842861


In [None]:
# Results of structure_style_readability_edit
X = structure_style_readability_edit.iloc[:, :-1]
y = structure_style_readability_edit['quality']

result_structure_style_readability_edit = train_and_evaluate_models(X, y)
result_structure_style_readability_edit

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.881731,0.882276,0.881731,0.88169,0.892308,0.892778,0.892308,0.892275
1,SVM,0.889183,0.890088,0.889183,0.889118,0.899038,0.901069,0.899038,0.898911
2,Random Forest,0.895913,0.89465,0.893029,0.893939,0.903846,0.904331,0.903846,0.903817
3,AdaBoost,0.88774,0.888789,0.88774,0.887667,0.883654,0.883894,0.883654,0.883636
4,XGBoost,0.902644,0.903565,0.902644,0.902587,0.896154,0.896177,0.896154,0.896152
5,MLP,0.89399,0.900016,0.897115,0.895395,0.885577,0.886092,0.885577,0.885539
6,KNN,0.867308,0.871969,0.867308,0.866872,0.8875,0.891264,0.8875,0.887229


### Feature selection for the best model
- structure+style+edit

In [91]:
structure_style_edit = df.iloc[:, list(range(7, 31))+list(range(38, 48)) + [-1]]

In [93]:
# Backward feature selection for structure_style_edit features
X = structure_style_edit.iloc[:, :-1]
y = structure_style_edit['quality']

model_xgb = XGBClassifier()
sequential_backward_feature_selection(X, y, model_xgb)

Selected Features: Index(['paragraph_count', 'mean_section_size', 'abstract_size',
       'abstract_text_ratio', 'citation_count', 'external_link_count',
       'internal_link_count', 'image_count', 'image_per_text', 'infobox',
       'largest_sentence_size', 'article_age_days', 'num_edits',
       'num_registered_editors', 'num_anonymous_editors', 'edit_per_editor',
       'discussion_count'],
      dtype='object')


Index(['paragraph_count', 'mean_section_size', 'abstract_size',
       'abstract_text_ratio', 'citation_count', 'external_link_count',
       'internal_link_count', 'image_count', 'image_per_text', 'infobox',
       'largest_sentence_size', 'article_age_days', 'num_edits',
       'num_registered_editors', 'num_anonymous_editors', 'edit_per_editor',
       'discussion_count'],
      dtype='object')

In [94]:
# Results with selected features
X = structure_style_edit[['paragraph_count', 'mean_section_size', 'abstract_size','abstract_text_ratio',
                          'citation_count', 'external_link_count', 'internal_link_count', 'image_count',
                          'image_per_text', 'infobox', 'largest_sentence_size', 'article_age_days', 'num_edits',
                          'num_registered_editors', 'num_anonymous_editors', 'edit_per_editor', 'discussion_count']]
y = structure_style_edit['quality']

selected_model = train_and_evaluate_models(X, y)
selected_model

Unnamed: 0,Model,CV Accuracy,CV Precision,CV Recall,CV F1 Score,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0,Logistic Regression,0.872115,0.872524,0.872115,0.872081,0.876923,0.877063,0.876923,0.876912
1,SVM,0.883654,0.885546,0.883654,0.883511,0.893269,0.897089,0.893269,0.893012
2,Random Forest,0.889904,0.887918,0.887981,0.886459,0.886538,0.887829,0.886538,0.886444
3,AdaBoost,0.877163,0.878081,0.877163,0.877089,0.875962,0.876464,0.875962,0.87592
4,XGBoost,0.896154,0.897162,0.896154,0.896087,0.892308,0.892592,0.892308,0.892288
5,MLP,0.895433,0.892337,0.89375,0.890806,0.891346,0.892253,0.891346,0.891283
6,KNN,0.864904,0.869273,0.864904,0.864493,0.879808,0.883795,0.879808,0.879495
