In [7]:
import pandas as pd

# load raw dataset
src = 'data/training_data_features.csv'
training_data = pd.read_csv(src)

# load grouped validation data
src_validation = 'data/validation_data_features.csv'
validation_data = pd.read_csv(src_validation)

In [None]:
training_data.columns

### Function for bag-of-words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

def create_bag_of_words(text):
    count_vectorizer = CountVectorizer()

    # Fit the vectorizer to the text data and transform the text
    bag_of_words = count_vectorizer.fit_transform([text])

    # Get feature names (words) from the vectorizer
    feature_names = count_vectorizer.get_feature_names_out()

    word_counts = {}
    for col in bag_of_words.nonzero()[1]:
        word_counts[feature_names[col]] = bag_of_words[0, col]

    return word_counts


input_text = "This is a sample text for testing bag of words."
bag_of_words_result = create_bag_of_words(input_text)
print(bag_of_words_result)


{'this': 1, 'is': 1, 'sample': 1, 'text': 1, 'for': 1, 'testing': 1, 'bag': 1, 'of': 1, 'words': 1}


### Logistic Regression with TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import pandas as pd

tfidf_vectorizer = TfidfVectorizer()

tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_data['content_stem'])

tfidf_validation_matrix = tfidf_vectorizer.transform(validation_data['content_stem'])

X_training_numeric = training_data[['date_count', 'url_count', 'exclm_count',
                                    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
                                    'stop_reduction_rate', 'stem_reduction_rate', 'average_sentence_length']]

X_training_combined = hstack((tfidf_training_matrix, X_training_numeric))

X_validation_numeric = validation_data[['date_count', 'url_count', 'exclm_count',
                                        'content_word_freq', 'stop_word_freq', 'stem_word_freq',
                                        'stop_reduction_rate', 'stem_reduction_rate', 'average_sentence_length']]


X_validation_combined = hstack((tfidf_validation_matrix, X_validation_numeric))

y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

logistic_model = LogisticRegression()
logistic_model.fit(X_training_combined, y_training_data)

predictions = logistic_model.predict(X_validation_combined)

print("Logistic Regression Performance:")
print(classification_report(y_validation_data, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Performance:
              precision    recall  f1-score   support

       False       0.76      0.74      0.75     21571
        True       0.75      0.77      0.76     22002

    accuracy                           0.76     43573
   macro avg       0.76      0.76      0.76     43573
weighted avg       0.76      0.76      0.76     43573



### Neural Network

With two hidden layers

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from scipy.sparse import hstack


tfidf_vectorizer = TfidfVectorizer()
tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_data['content_stem'])
tfidf_validation_matrix = tfidf_vectorizer.transform(validation_data['content_stem'])


X_training_numeric = training_data[['date_count', 'url_count', 'exclm_count',
                                    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
                                    'stop_reduction_rate', 'stem_reduction_rate', 'average_sentence_length']]
X_validation_numeric = validation_data[['date_count', 'url_count', 'exclm_count',
                                        'content_word_freq', 'stop_word_freq', 'stem_word_freq',
                                        'stop_reduction_rate', 'stem_reduction_rate', 'average_sentence_length']]

X_training_combined = hstack((tfidf_training_matrix, X_training_numeric.values))
X_validation_combined = hstack((tfidf_validation_matrix, X_validation_numeric.values))


y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=100, random_state=42)
mlp_model.fit(X_training_combined, y_training_data)

predictions = mlp_model.predict(X_validation_combined)

print(classification_report(y_validation_data, predictions))




              precision    recall  f1-score   support

       False       0.96      0.97      0.96     21571
        True       0.97      0.96      0.96     22002

    accuracy                           0.96     43573
   macro avg       0.96      0.96      0.96     43573
weighted avg       0.96      0.96      0.96     43573



With single hidden layer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import pandas as pd

tfidf_vectorizer = TfidfVectorizer()

tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_data['content_stem'])

tfidf_validation_matrix = tfidf_vectorizer.transform(validation_data['content_stem'])

numeric_features = ['date_count', 'url_count', 'exclm_count',
                    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
                    'stop_reduction_rate', 'stem_reduction_rate', 'average_sentence_length']

X_training_numeric = training_data[numeric_features]
X_validation_numeric = validation_data[numeric_features]

X_training_combined = hstack((tfidf_training_matrix, X_training_numeric))
X_validation_combined = hstack((tfidf_validation_matrix, X_validation_numeric))

# Scale input features
scaler = StandardScaler(with_mean=False)  # Pass with_mean=False for sparse matrices
X_training_scaled = scaler.fit_transform(X_training_combined)
X_validation_scaled = scaler.transform(X_validation_combined)

y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, batch_size=256, early_stopping=True, verbose=True)
mlp_model.fit(X_training_scaled, y_training_data)

predictions = mlp_model.predict(X_validation_scaled)

print("MLP Classifier Performance:")
print(classification_report(y_validation_data, predictions))


### Support Vector Machine

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import pandas as pd

tfidf_vectorizer = TfidfVectorizer()

tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_data['content_stem'])

tfidf_validation_matrix = tfidf_vectorizer.transform(validation_data['content_stem'])

X_training_numeric = training_data[['num_count', 'date_count', 'url_count', 'comma_count', 'exclm_count',
                                    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
                                    'stop_reduction_rate', 'stem_reduction_rate', 'average_sentence_length']]

X_training_combined = hstack((tfidf_training_matrix, X_training_numeric))

X_validation_numeric = validation_data[['num_count', 'date_count', 'url_count', 'comma_count', 'exclm_count',
                                        'content_word_freq', 'stop_word_freq', 'stem_word_freq',
                                        'stop_reduction_rate', 'stem_reduction_rate', 'average_sentence_length']]

X_validation_combined = hstack((tfidf_validation_matrix, X_validation_numeric))

y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

svm_model = SVC()
svm_model.fit(X_training_combined, y_training_data)

predictions = svm_model.predict(X_validation_combined)

print("Support Vector Machine (SVM) Performance:")
print(classification_report(y_validation_data, predictions))
