# Logistic Regression Classifier
___

This model is based on:

```Bibtex
@inproceedings{levyContextDependentClaim2014a,
  title = {Context Dependent Claim Detection},
  author = {Levy, Ran and Bilu, Yonatan and Hershcovich, Daniel and Aharoni, Ehud and Slonim, Noam},
  date = {2014},
  url = {https://aclanthology.org/C14-1141/},
}
```

Features:
- sentence-topic similarity
- Linguistic expansion
- Keyword "that"
- sentiment
- subjectivity

Parameters: set in the configuration cell below (`MODEL_NAME`, `TRACKING`, `FALSE_CLASS_BALANCE`, `dataset`).

In [26]:
import os

import wandb
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler

from config import PROJECT_NAME, DATASETS
from src.dataset import load_dataset
from src.features import ThatToken, Sentiment, Subjectivity, SentenceTopicSimilarity

In [27]:
# Run configuration; these names are read by the cells below.
MODEL_NAME = "LogisticRegression"  # logged as the "model" field of the wandb run config
TRACKING = True  # when True, the run config and test metrics are sent to wandb
# Forwarded to load_dataset(false_class_balance=...).
# NOTE(review): presumably the ratio of False to True examples kept — confirm in src.dataset.
FALSE_CLASS_BALANCE = 1.0

# Dataset entry from config.DATASETS; its "name", "base_path" and "data" keys are used below.
dataset = DATASETS["dataset_2014"]

### 0. Load data

In [28]:
# Resolve the dataset file from the selected config entry and load the
# train/test split (features and labels) used by the rest of the notebook.
X_train, X_test, y_train, y_test = load_dataset(
    dataset_path=os.path.join(dataset["base_path"], dataset["data"]),
    false_class_balance=FALSE_CLASS_BALANCE,
)

### 1. Encode features

In [29]:
# Text feature extractors applied to the sentence column, grouped in a
# FeatureUnion (currently a single TF-IDF vectorizer).
tfidf_step = ("tf-idf", TfidfVectorizer())
text_features = FeatureUnion(transformer_list=[tfidf_step])

In [30]:
# Per-column feature extraction. Each entry is (name, transformer, input column(s));
# columns not listed here are dropped from the output.
feature_transformers = [
    ("tf-idf", text_features, "Sentence"),
    ("that", ThatToken(), "Sentence"),
    ("sentiment", Sentiment(), "Sentence"),
    ("subjectivity", Subjectivity(), "Sentence"),
    ("similarity", SentenceTopicSimilarity(), ["Sentence", "Article"]),
]
column_trans = ColumnTransformer(
    transformers=feature_transformers,
    remainder="drop",
    verbose=True,
)

### 2. Create model

In [31]:
# Full model: feature extraction -> scaling -> logistic regression.
pipeline_steps = [
    ("preprocessing", column_trans),
    # Scale without centering (with_mean=False);
    # NOTE(review): presumably because the TF-IDF features are sparse — confirm.
    ("scaler", StandardScaler(with_mean=False)),
    ("classify", LogisticRegression(max_iter=200)),
]
pipe = Pipeline(steps=pipeline_steps, verbose=True)

### 4. Train model

In [32]:
# Start a Weights & Biases run and record the experiment configuration.
if TRACKING:
    run_config = {
        "model": MODEL_NAME,
        # Log the pipeline's own repr. The previous value stringified the
        # uncalled bound method `pipe.get_feature_names_out`, which wrapped the
        # repr in "<bound method ... of ...>" noise.
        "setup": str(pipe),
        "dataset": dataset["name"],
        "train_data_size": len(X_train),
        "validation_data_size": 0,  # no validation split is used in this notebook
        "test_data_size": len(X_test),
        # Fields below are logged as None for this model.
        "batch_size": None,
        "learning_rate": None,
        "epochs": None,
        "false_class_balance": FALSE_CLASS_BALANCE,
    }
    wandb.init(project=PROJECT_NAME, config=run_config)

[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [33]:
# Fit every pipeline step on the training split. Because verbose=True is set on
# both the ColumnTransformer and the Pipeline, per-step timings are printed.
pipe.fit(X_train, y_train)

[ColumnTransformer] ........ (1 of 5) Processing tf-idf, total=   0.1s
[ColumnTransformer] .......... (2 of 5) Processing that, total=   0.0s
[ColumnTransformer] ..... (3 of 5) Processing sentiment, total=   0.3s
[ColumnTransformer] .. (4 of 5) Processing subjectivity, total=   0.2s
[ColumnTransformer] .... (5 of 5) Processing similarity, total=   1.8s
[Pipeline] ..... (step 1 of 3) Processing preprocessing, total=   2.4s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing classify, total=   0.1s


Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('tf-idf',
                                                  FeatureUnion(transformer_list=[('tf-idf',
                                                                                  TfidfVectorizer())]),
                                                  'Sentence'),
                                                 ('that', ThatToken(),
                                                  'Sentence'),
                                                 ('sentiment', Sentiment(),
                                                  'Sentence'),
                                                 ('subjectivity',
                                                  Subjectivity(), 'Sentence'),
                                                 ('similarity',
                                                  SentenceTopicSimilarity(),
                                                  ['Sentence', 'Article'])],
           

### 5. Predict results

In [34]:
# Predict labels for the test split with the fitted pipeline.
Y_pred = pipe.predict(X_test)

### 6. Evaluate results

In [35]:
# Print the per-class precision / recall / F1 report for the test split.
print(classification_report(y_test, Y_pred))

              precision    recall  f1-score   support

       False       0.76      0.75      0.76       147
        True       0.75      0.77      0.76       147

    accuracy                           0.76       294
   macro avg       0.76      0.76      0.76       294
weighted avg       0.76      0.76      0.76       294



In [36]:
# Test-set metrics with scikit-learn's default settings; reused by the tracking cell.
accuracy = accuracy_score(y_true=y_test, y_pred=Y_pred)
precision = precision_score(y_true=y_test, y_pred=Y_pred)
recall = recall_score(y_true=y_test, y_pred=Y_pred)
f1 = f1_score(y_true=y_test, y_pred=Y_pred)

In [37]:
# Send the test metrics to Weights & Biases and close the run.
if TRACKING:
    # Log all metrics in one call so they share a single wandb step; separate
    # wandb.log calls each advance the step counter, which would spread this
    # one evaluation across four steps.
    wandb.log(
        {
            "test_f1": f1,
            "test_recall": recall,
            "test_precision": precision,
            "test_accuracy": accuracy,
        }
    )
    wandb.finish()

0,1
test_accuracy,▁
test_f1,▁
test_precision,▁
test_recall,▁

0,1
test_accuracy,0.7585
test_f1,0.76094
test_precision,0.75333
test_recall,0.76871
