In [1]:
import copy

import pandas as pd
import matplotlib.pyplot as plt

# learners
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier

# embeddings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# preprocessing
from sklearn.model_selection import train_test_split

# evaluation
from sklearn.metrics import classification_report

# pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## Question 1

- Create a dataframe by reading the file `eng.csv`. You can find the .csv file under "files" on Canvas. (We suggest you take a sample of the data, for example with 1,000, 10,000 or 20,000 instances, to save computing time.) 

- Use value_counts to see the counts for each value of the column, `emotion`. 

We will create models to predict emotions based on texts. Assign the `text` column to X and the `emotion` column to y.

In [9]:
# Draw a fixed-size random sample to keep computing time low. The seed makes
# the sample (and every result derived from it) identical on each re-run.
df = pd.read_csv("data/eng.csv").sample(n=1000, random_state=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 203231 to 148352
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      1000 non-null   object
 1   emotion   1000 non-null   object
 2   language  1000 non-null   object
dtypes: object(3)
memory usage: 31.2+ KB


In [10]:
# Class distribution of the `emotion` column in the sampled frame.
df["emotion"].value_counts()

anticipation    413
joy             382
sadness         120
anger            68
fear             17
Name: emotion, dtype: int64

In [11]:
# Features are the raw texts; the target is the emotion label.
X, y = df['text'], df['emotion']

In [12]:
# Same class counts as in the earlier cell; this column is the target `y`.
df["emotion"].value_counts()

anticipation    413
joy             382
sadness         120
anger            68
fear             17
Name: emotion, dtype: int64

## Question 2

- Perform a train test split and then fit_transform a `CountVectorizer` on X_train and transform X_test.

In [13]:
# Stratified, shuffled split with a fixed seed; the test fraction is left at
# the train_test_split default.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, shuffle=True, random_state=1
)

In [14]:
# Learn the vocabulary on the training texts only, then reuse that same
# vocabulary to encode the test texts.
cv = CountVectorizer()
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)

#### Plotting top N tokens

In [None]:
# Number of top-ranked tokens to show in the bar chart below.
N = 20

def top_n_tokens(X, cv, N, plot=True):
    """Rank vocabulary tokens by their column total in ``X``.

    Parameters
    ----------
    X : matrix-like
        Document-term matrix. ``X.sum(axis=0)`` must yield a 2-D row that can
        be indexed as ``[0, column]``.
    cv : fitted vectorizer
        Only ``cv.vocabulary_`` (token -> column index) is read.
    N : int
        Number of highest-total tokens to keep.
    plot : bool, default True
        When true, draw a bar chart of the top tokens and return ``None``;
        otherwise return the ranked ``(token, total)`` pairs.
    """
    column_totals = X.sum(axis=0)
    ranked = sorted(
        ((token, column_totals[0, column]) for token, column in cv.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top = ranked[:N]

    if not plot:
        return top

    tokens = [token for token, _ in top]
    totals = [total for _, total in top]
    plt.figure(figsize=(10, 5))
    plt.bar(tokens, totals)
    plt.xticks(rotation=45)
    plt.xlabel('Token')
    plt.ylabel('Frequency')
    plt.show()
    
# Bar chart of the N tokens with the largest total counts in the training matrix.
top_n_tokens(X_train_cv, cv, N)


## Question 3

- Build one or more classifiers. 
- Report the accuracy score for vectorized train and test data.
- Print a classification report for model performance on vectorized test data

In [15]:
# Peek at the training labels produced by the split.
y_train

235123    anticipation
160087    anticipation
44494              joy
276268             joy
270096    anticipation
              ...     
19111            anger
224359    anticipation
62715              joy
148387             joy
199116    anticipation
Name: emotion, Length: 750, dtype: object

In [16]:
# Logistic regression on the count features, with a raised iteration cap
# for the lbfgs solver.
lr = LogisticRegression(
    max_iter=5000,
    solver='lbfgs',
)
lr.fit(X_train_cv, y_train)

In [None]:
# Accuracy of the logistic regression on each split, train first.
for split_name, features, labels in (
    ("Train", X_train_cv, y_train),
    ("Test", X_test_cv, y_test),
):
    lr_score = lr.score(features, labels)
    print(split_name, lr_score)

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
y_pred = lr.predict(X_test_cv)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Linear classifier trained with stochastic gradient descent. The seed pins the
# training run so the reported scores are the same on every re-run.
sgd = SGDClassifier(loss='modified_huber', max_iter=5000, random_state=0)
sgd.fit(X_train_cv, y_train)

# Accuracy on the training split.
sgd_score = sgd.score(X_train_cv, y_train)
print("Train", sgd_score)

# Accuracy on the held-out split.
sgd_score = sgd.score(X_test_cv, y_test)
print("Test", sgd_score)

# Per-class precision / recall / F1 on the held-out split.
y_pred = sgd.predict(X_test_cv)
print(classification_report(y_test, y_pred))


## Question 4
In the above model(s), we used unigrams (single words) only. This is the default for `CountVectorizer`.

- Try with unigrams and bigrams, and also unigrams, bigrams and trigrams.  
You do this by setting ngram_range for `CountVectorizer`. 

- Build a logistic regression model for each of these settings and report on the results.

In [None]:
def ngram_experiment(data, count_vect, ngram_range, model1, model2):
    """Compare two classifiers across several n-gram settings.

    For each entry of ``ngram_range`` a copy of ``count_vect`` is configured
    with that setting and fitted on ``data["X_train"]``; copies of both models
    are then fitted on the resulting features and scored on train and test.
    The objects passed in are never re-fitted or reconfigured, so the caller's
    vectorizer and models keep the state they had before the call.

    Parameters
    ----------
    data : mapping
        Must provide the keys ``"X_train"``, ``"y_train"``, ``"X_test"`` and
        ``"y_test"``.
    count_vect : vectorizer
        Object with an ``ngram_range`` attribute, ``fit_transform``,
        ``transform`` and (after fitting) ``vocabulary_``.
    ngram_range : iterable
        The n-gram settings to try, one per experiment run (despite the
        singular name this is a collection of ranges).
    model1, model2 : estimators
        Objects with ``fit`` and ``score``. Their class names are used as
        column prefixes, so they should be instances of different classes.

    Returns
    -------
    pandas.DataFrame
        One row per n-gram setting with the vocabulary size, the rounded
        train/test score of each model and ``mean_train`` (mean of the two
        train scores), sorted by ``mean_train`` in descending order.
    """
    model1_name = model1.__class__.__name__
    model2_name = model2.__class__.__name__

    results = []
    for ngrams in ngram_range:
        print("Running n-grams:", ngrams)

        # Work on copies so that running the experiment has no side effects on
        # the caller's objects.
        vectorizer = copy.deepcopy(count_vect)
        vectorizer.ngram_range = ngrams
        candidate1 = copy.deepcopy(model1)
        candidate2 = copy.deepcopy(model2)

        X_train_cv = vectorizer.fit_transform(data["X_train"])
        X_test_cv = vectorizer.transform(data["X_test"])

        candidate1.fit(X_train_cv, data["y_train"])
        candidate2.fit(X_train_cv, data["y_train"])

        # Built-in round() works whether score() returns a NumPy scalar or a
        # plain Python float; the latter has no .round() method.
        results.append(
            {
                "ngrams": ngrams,
                "vocabulary_size": len(vectorizer.vocabulary_),
                f"{model1_name}_train": round(float(candidate1.score(X_train_cv, data["y_train"])), 3),
                f"{model1_name}_test": round(float(candidate1.score(X_test_cv, data["y_test"])), 3),
                f"{model2_name}_train": round(float(candidate2.score(X_train_cv, data["y_train"])), 3),
                f"{model2_name}_test": round(float(candidate2.score(X_test_cv, data["y_test"])), 3),
            }
        )
    return (
        pd.DataFrame(results)
        .assign(
            mean_train=lambda x: x[[f"{model1_name}_train", f"{model2_name}_train"]].mean(axis=1).round(3),
        )
        .sort_values("mean_train", ascending=False)
    )

    


# Run the comparison for unigrams up to trigrams, plus a bigram+trigram-only
# setting, using the two classifiers built above.
split = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
candidate_ranges = [(1, 1), (1, 2), (1, 3), (2, 3)]
ngram_experiment(data=split, count_vect=cv, ngram_range=candidate_ranges, model1=lr, model2=sgd)

<h2>Question 5</h2>

- Use dummy classifier with the default settings (most frequent class), and the uniform strategy (random guessing). 

- Select the `CountVectorizer` with the optimal `ngram_range`. Use the vectorized version of the data from this model.
- Print the train and test results for each dummy classifier, to determine some baselines for comparison.

In [None]:
# Re-vectorize with unigrams, bigrams and trigrams; the vocabulary is learned
# on the training texts and reused for the test texts.
cv = CountVectorizer(ngram_range=(1, 3))
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)

In [None]:
# Baseline 1: dummy classifier with its default strategy.
dummy = DummyClassifier()
dummy.fit(X_train_cv, y_train)

for split_name, features, labels in (
    ("Train", X_train_cv, y_train),
    ("Test", X_test_cv, y_test),
):
    print(split_name, dummy.score(features, labels))

In [None]:
# Baseline 2: uniform random guessing. A new estimator is built instead of
# mutating the previous one, so this cell gives the same result however often
# it is re-run, and the seed makes the random guesses reproducible.
dummy = DummyClassifier(strategy="uniform", random_state=0)
dummy.fit(X_train_cv, y_train)

print("Train", dummy.score(X_train_cv, y_train))
print("Test", dummy.score(X_test_cv, y_test))

## Question 6

- Use the `TfidfTransformer`, to create Term Frequency - Inverse Document Frequency (tfidf) scores instead of frequency scores. 
- You can apply the `TfidfTransformer` on the vectors created by `CountVectorizer`, using the fit_transform method just as is done with `CountVectorizer`. 
- Create a logistic regression model with the data produced by `TfidfTransformer`, and report the scores on train and test.

- **TIP**: Instead of calculating frequencies with `CountVectorizer` and *then* calculating TF-IDF scores from said frequencies, you can import and call `TfidfVectorizer` directly, with the same syntax as you used for `CountVectorizer`

In [None]:
# TF-IDF features over 1- to 3-grams with English stop words removed; fitted
# on the training texts only.
tf_idf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
)
X_train_tfidf, X_test_tfidf = tf_idf.fit_transform(X_train), tf_idf.transform(X_test)

In [None]:
# Plot the top 30 tokens by summed TF-IDF weight (the helper still labels
# the y-axis 'Frequency').
top_n_tokens(X_train_tfidf, tf_idf, N=30)

In [None]:
# Logistic regression on the TF-IDF features. `multi_class` is no longer
# passed: 'auto' was the default behaviour anyway, and recent scikit-learn
# releases deprecate supplying the argument at all.
lr = LogisticRegression(solver='lbfgs', random_state=0, max_iter=5_000)

lr.fit(X_train_tfidf, y_train)

# Accuracy on the training split, then on the held-out split.
lr_score = lr.score(X_train_tfidf, y_train)
print("Train", lr_score)
lr_score = lr.score(X_test_tfidf, y_test)
print("Test", lr_score)

In [None]:
# Per-class metrics of the TF-IDF logistic regression on the test split.
preds = lr.predict(X_test_tfidf)
tfidf_report = classification_report(y_test, preds)
print(tfidf_report)

<h2>Question 7</h2>


- Create a Scikit-Learn `Pipeline`, consisting of `CountVectorizer`, `TfidfTransformer`, and `LogisticRegression`. 

- Apply the pipeline to the training data, just as in the previous question, and report results on train and test.

In [None]:
# Two-step pipeline: TF-IDF features straight from the raw text, followed by
# logistic regression. Fitting on raw text keeps vectorization inside the
# pipeline.
steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=5000)),
]
pipe = Pipeline(steps, verbose=True)
pipe.fit(X_train, y_train)

for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(split_name, pipe.score(features, labels))


<h2>Question 8</h2>

- Use the above pipeline with GridSearchCV. 

**Hint** You can use the following choices for parameters: 
- for CountVectorizer, use ngram ranges of (1,1), (1,2), and (1,3). 
- For TfidfTransformer set the *use_idf* parameter to `True` or `False`. Print the best score and best parameter choices.

In [None]:
# `params` stays a flat dict because its keys are also used to report the best
# settings after the search; the actual search space is derived from it below.
params = {
    "tfidf__ngram_range": ((1, 1), (1, 2), (1, 3)),
    "tfidf__stop_words": (None, "english"),
    "tfidf__min_df": (0.1, 1),
    # Floats are document proportions, ints are absolute document counts. An
    # int 1 would discard every term found in more than one document (and
    # clashes with min_df=0.1), so "no upper limit" must be written as 1.0.
    "tfidf__max_df": (0.5, 0.75, 1.0),
    "clf__penalty": ("l1", "l2"),
    "clf__solver": ("liblinear", "saga", "lbfgs"),
}
# lbfgs does not support the l1 penalty, so it is searched with l2 only
# instead of producing failed fits for the invalid combinations.
param_grid = [
    {**params, "clf__solver": ("liblinear", "saga")},
    {**params, "clf__penalty": ("l2",), "clf__solver": ("lbfgs",)},
]
grid_search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=1, cv=3)
grid_search.fit(X_train, y_train)

In [None]:
# Cross-validated score of the best parameter combination.
print(f"Best score: {grid_search.best_score_:.3f}")

# Report the winning value of every parameter that was part of the search.
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(params):
    chosen_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, chosen_value))

# Score of the refitted best estimator on each split.
for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(split_name, grid_search.score(features, labels))

In [None]:
# Second search: character n-grams instead of word n-grams.
param_char = {
    "tfidf__ngram_range": ((4, 7), (5, 8)),
    "tfidf__analyzer": ["char"],
    "tfidf__min_df": (0.05, 0.1, 1),
    # Floats are document proportions, ints are absolute document counts. An
    # int 1 would keep only n-grams found in a single document (and clashes
    # with the fractional min_df values), so "no upper limit" is 1.0.
    "tfidf__max_df": (0.75, 1.0),
    "clf__solver": ("liblinear", "lbfgs"),
}
grid_search = GridSearchCV(pipe, param_char, n_jobs=-2, verbose=1, cv=3)
grid_search.fit(X_train, y_train)

In [None]:
# Cross-validated score of the best character n-gram configuration.
print(f"Best score: {grid_search.best_score_:.3f}")

# Report the winning value of every parameter that was part of the search.
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_char):
    chosen_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, chosen_value))

# Score of the refitted best estimator on each split.
for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(split_name, grid_search.score(features, labels))

<h2>Question 9</h2>
Use classification report with the best model resulting from grid_search in the previous two questions.

In [None]:
# Report for the most recently fitted grid search (`grid_search` was rebound by
# the character n-gram search). zero_division=0 reports 0 instead of warning
# for classes that are never predicted.
print(classification_report(y_test, grid_search.predict(X_test), zero_division=0))

## Bonus: Question 10
Let's go back to question 1 and convert the problem to a binary one. 

- Map ("joy", "anticipation") to "positive" and ("anger", "fear", "sadness") to "negative" (Use the snippet below)

- Then rerun the experiments above with this new scope. 
- Report your new baseline and comment on the effect of changing the scope.

In [None]:
# Collapse the five emotions into two polarity classes.
polarity_by_emotion = {
    **dict.fromkeys(("joy", "anticipation"), "positive"),
    **dict.fromkeys(("anger", "fear", "sadness"), "negative"),
}
df["emotion"] = df["emotion"].replace(polarity_by_emotion)