In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# Load the labelled data set that is split into train and test parts below.
data = pd.read_csv("itog_data.csv")

# Merge the five question columns into one space-separated document per row.
# Missing values are replaced by empty strings first: `str + NaN` evaluates to
# NaN, and a NaN document makes TfidfVectorizer raise instead of vectorizing.
question_columns = ['question_1', 'question_2', 'question_3', 'question_4', 'question_5']
data['combined_questions'] = data[question_columns].fillna('').agg(' '.join, axis=1)

# One shared feature column and three separate target columns.
X = data['combined_questions']
y_relevant = data['is_relevant']
y_object = data['object']
y_positive = data['is_positive']

# Split the features and all three targets in ONE call so the train/test rows
# are aligned by construction, instead of relying on three separate calls that
# happen to reuse the same random_state.
(
    X_train, X_test,
    y_train_relevant, y_test_relevant,
    y_train_object, y_test_object,
    y_train_positive, y_test_positive,
) = train_test_split(X, y_relevant, y_object, y_positive, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training documents only, then apply the
# fitted vectorizer to the test documents.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def _fit_and_score(y_train, y_test):
    """Fit a default LogisticRegression on ``X_train_tfidf`` and score it.

    Returns a ``(model, predictions, accuracy)`` tuple, where ``predictions``
    are the model's outputs for ``X_test_tfidf`` and ``accuracy`` is
    ``accuracy_score(y_test, predictions)``.
    """
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)
    return model, predictions, accuracy_score(y_test, predictions)


# One independent classifier per target column, all trained on the same
# TF-IDF features.
lr_relevant, y_pred_relevant, accuracy_relevant = _fit_and_score(y_train_relevant, y_test_relevant)
print("Accuracy for 'is_relevant':", accuracy_relevant)

lr_object, y_pred_object, accuracy_object = _fit_and_score(y_train_object, y_test_object)
print("Accuracy for 'object':", accuracy_object)

lr_positive, y_pred_positive, accuracy_positive = _fit_and_score(y_train_positive, y_test_positive)
print("Accuracy for 'is_positive':", accuracy_positive)


Accuracy for 'is_relevant': 0.9923954372623575
Accuracy for 'object': 0.9581749049429658
Accuracy for 'is_positive': 0.9923954372623575


In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score


def evaluate_model(y_true, y_pred):
    """Return macro-averaged ``(precision, recall, f1)`` for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        A 3-tuple ``(precision, recall, f1)``, each computed with
        ``average='macro'``.
    """
    macro_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = (
        scorer(y_true, y_pred, average='macro') for scorer in macro_scorers
    )
    return precision, recall, f1


def _print_scores(heading, scores):
    """Print ``heading`` followed by one labelled line per value in ``scores``.

    ``scores`` is expected in ``(precision, recall, f1)`` order.
    """
    print(heading)
    for label, value in zip(("Precision:", "Recall:", "F1-score:"), scores):
        print(label, value)


# Held-out split metrics for each of the three classifiers.
relevant_scores = evaluate_model(y_test_relevant, y_pred_relevant)
precision_relevant, recall_relevant, f1_relevant = relevant_scores
_print_scores("Metrics for 'is_relevant':", relevant_scores)


object_scores = evaluate_model(y_test_object, y_pred_object)
precision_object, recall_object, f1_object = object_scores
_print_scores("\nMetrics for 'object':", object_scores)


positive_scores = evaluate_model(y_test_positive, y_pred_positive)
precision_positive, recall_positive, f1_positive = positive_scores
_print_scores("\nMetrics for 'is_positive':", positive_scores)

Metrics for 'is_relevant':
Precision: 0.9940828402366864
Recall: 0.9895833333333333
F1-score: 0.9917606516290728

Metrics for 'object':
Precision: 0.9600032986970146
Recall: 0.9581591779867642
F1-score: 0.9585161739319954

Metrics for 'is_positive':
Precision: 0.9935897435897436
Recall: 0.9908256880733946
F1-score: 0.9921445639187574


In [8]:
# Load the second data set that the already-trained models are evaluated on.
# NOTE(review): the frame is named `df_test` but is read from 'train_data.csv' —
# confirm this is the intended evaluation file.
df_test = pd.read_csv('train_data.csv')

In [9]:
# Merge the five question columns into one space-separated document per row.
# Missing values are replaced by empty strings first: `str + NaN` evaluates to
# NaN, and the vectorizer rejects NaN documents.
question_columns = ['question_1', 'question_2', 'question_3', 'question_4', 'question_5']
df_test['combined_questions'] = df_test[question_columns].fillna('').agg(' '.join, axis=1)

# Reuse the already-fitted vectorizer (transform only, no re-fit) so the
# feature columns match the ones the classifiers were trained on.
test_x = tfidf_vectorizer.transform(df_test['combined_questions'])
test_y_relevant = df_test['is_relevant']
test_y_object = df_test['object']
test_y_positive = df_test['is_positive']

In [11]:
# Macro precision / recall / F1 of the 'is_relevant' classifier on df_test.
pred_relevant_on_df_test = lr_relevant.predict(test_x)
evaluate_model(test_y_relevant, pred_relevant_on_df_test)

(0.8530219780219781, 0.8294871794871794, 0.8406629834254142)

In [12]:
# Macro precision / recall / F1 of the 'object' classifier on df_test.
pred_object_on_df_test = lr_object.predict(test_x)
evaluate_model(test_y_object, pred_object_on_df_test)

(0.9244689221085759, 0.9229741019214703, 0.9221054705595068)

In [13]:
# Macro precision / recall / F1 of the 'is_positive' classifier on df_test.
pred_positive_on_df_test = lr_positive.predict(test_x)
evaluate_model(test_y_positive, pred_positive_on_df_test)

(0.8535947712418301, 0.8259036144578313, 0.8386591478696741)