# 01 - Sentiment Analysis with Python

### Setting up credentials to use Google Cloud

In [None]:
import os

# Path to your service-account key file. Replace the placeholder below, or leave
# it untouched to keep a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment (the placeholder must never overwrite it).
PLACEHOLDER_CREDENTIALS_PATH = "your-credentials.json"
credentials_path = "your-credentials.json"

if credentials_path != PLACEHOLDER_CREDENTIALS_PATH:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

# Fail fast, before any BigQuery call, when no usable path is configured.
assert os.getenv("GOOGLE_APPLICATION_CREDENTIALS") not in (None, "", PLACEHOLDER_CREDENTIALS_PATH), "Please set your credentials path"

### Running a query against BigQuery from a Jupyter notebook

In [None]:
# Load the Google Cloud BigQuery extension for Jupyter; the %%bigquery cells
# further down rely on it.
%load_ext google.cloud.bigquery

Try querying 10 rows to test the connection.


In [None]:
%%bigquery

-- Connection smoke test: fetch 10 labelled reviews from the training split.
SELECT review, label
FROM `bigquery-public-data.imdb.reviews`
WHERE split = "train"
  AND label IN ("Negative", "Positive")
LIMIT 10

### Querying train dataset

In [None]:
%%bigquery train_df

-- Training split: de-duplicated (review, label) pairs, restricted to the two
-- sentiment labels. The result is stored in the variable `train_df`.
SELECT DISTINCT review, label
FROM `bigquery-public-data.imdb.reviews`
WHERE split = "train"
  AND label IN ("Negative", "Positive")

In [None]:
# Preview the first rows of the training data returned by the query above.
train_df.head()

In [None]:
# (number of rows, number of columns) of the training data.
train_df.shape

In [None]:
# Print column dtypes and non-null counts for the training data.
train_df.info()

In [None]:
# Per-column summary statistics for the training data.
train_df.describe()

In [None]:
# Number of training reviews per label, to check how balanced the classes are.
train_df['label'].value_counts()

### Querying test dataset

In [None]:
%%bigquery test_df

-- Test split: de-duplicated (review, label) pairs, restricted to the two
-- sentiment labels. The result is stored in the variable `test_df`.
SELECT DISTINCT review, label
FROM `bigquery-public-data.imdb.reviews`
WHERE split = "test"
  AND label IN ("Negative", "Positive")

In [None]:
# Preview the first rows of the test data returned by the query above.
test_df.head()

In [None]:
# (number of rows, number of columns) of the test data.
test_df.shape

In [None]:
# Print column dtypes and non-null counts for the test data.
test_df.info()

In [None]:
# Per-column summary statistics for the test data.
test_df.describe()

In [None]:
# Number of test reviews per label, to check how balanced the classes are.
test_df['label'].value_counts()

### Training a classification model

In [None]:
import pandas as pd

from sklearn import linear_model, metrics
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build a bag-of-words vectorizer whose tokens are runs of at least 2 a-z
# characters (`{2,}` means "two or more"). CountVectorizer lowercases text by
# default, so uppercase letters are folded before the pattern is applied.
count_vectorizer = CountVectorizer(token_pattern="[a-z]{2,}")

# Training set: learn the vocabulary from the training reviews, then encode the
# labels as integers (1 = "Negative", 0 = anything else) with a vectorized
# comparison rather than a per-row lambda.
x_train = count_vectorizer.fit_transform(train_df['review'])
y_train = (train_df['label'] == "Negative").astype(int)

# Test set: reuse the vocabulary fitted on the training reviews (transform, not
# fit_transform) so the feature columns line up with x_train.
x_test = count_vectorizer.transform(test_df['review'])
y_test = (test_df['label'] == "Negative").astype(int)

In [None]:
# Visualize the bag-of-words features produced by sklearn.
# Only the previewed rows are densified: calling x_train.toarray() on the whole
# sparse matrix allocates one cell per (review, vocabulary term) pair and can
# exhaust memory, while this cell only displays the first few rows.
PREVIEW_ROWS = 5
df_bow_sklearn = pd.DataFrame(
    x_train[:PREVIEW_ROWS].toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)
df_bow_sklearn.head()

In [None]:
# Fit a logistic regression classifier (lbfgs solver, at most 1000 iterations)
# on the training features. fit() returns the estimator itself, so construction
# and fitting are chained and the fitted model is displayed as the cell output.
model = linear_model.LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
).fit(x_train, y_train)
model

In [None]:
# Evaluate the trained model on the test dataset.
predictions = model.predict(x_test)

# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy and
# macro-averaged F1 give the same value either way, but passing the arguments by
# keyword keeps the call correct if an order-sensitive metric is added here.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
f1_score = metrics.f1_score(y_true=y_test, y_pred=predictions, average='macro')

accuracy, f1_score