In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import pickle

# Seed NumPy's global random number generator so that steps drawing from it
# are repeatable on a fresh top-to-bottom run of the notebook.
np.random.seed(1)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# 数据准备

1. 数据导入
2. 数据清洗
3. 拆分训练集、测试集
4. 向量化

In [None]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df_ = pd.read_csv('AirlineTweets.csv')

In [None]:
# Preview the first rows of the raw frame to see which columns are available.
df_.head()

In [None]:
# Work on an independent copy that holds only the label and text columns,
# so columns added later never touch the raw frame.
df = df_.loc[:, ['airline_sentiment', 'text']].copy()

In [None]:
# Confirm the trimmed frame contains just the two selected columns.
df.head()

In [None]:
# Class-balance check: bar heights show how many rows carry each sentiment label.
df['airline_sentiment'].hist()

In [None]:
# Integer class codes for the sentiment labels listed here; any label missing
# from this mapping comes out of .map() as NaN.
target_map = {
    'positive': 1,
    'negative': 0,
    'neutral': 2,
}
df['target'] = df.airline_sentiment.map(target_map)

In [None]:
# Check that the new integer 'target' column lines up with the text labels.
df.head()

In [None]:
# Hold out a test set using train_test_split's default proportions.
# An explicit random_state pins the split to this cell: re-running it, or
# running cells out of order, no longer depends on how much of NumPy's global
# random stream has already been consumed, so the train/test frames and any
# features derived from them stay consistent.
df_train, df_test = train_test_split(df, random_state=1)

In [None]:
# Spot-check the first rows of the training split.
df_train.head()

In [None]:
# TF-IDF features, with the vocabulary capped at the 2000 top terms by corpus frequency.
vectorizer = TfidfVectorizer(max_features=2000)

In [None]:
# Learn the vocabulary and IDF weights from the training text only, then
# apply that already-fitted transform to the test text.
X_train, X_test = (
    vectorizer.fit_transform(df_train['text']),
    vectorizer.transform(df_test['text']),
)
X_train.shape


In [None]:
# Integer class labels taken from the same split frames as X_train / X_test.
Y_train, Y_test = df_train['target'], df_test['target']

# 训练模型

In [None]:
# Logistic regression on the TF-IDF features; max_iter is set to 500 to give
# the solver more iterations to converge. fit() returns the estimator itself.
model = LogisticRegression(max_iter=500).fit(X_train, Y_train)
print('Train acc: ', model.score(X_train, Y_train))
print("Test  acc: ", model.score(X_test, Y_test))

In [None]:
# Per-class probability estimates, scored with one-vs-one multiclass ROC AUC.
Pr_train, Pr_test = model.predict_proba(X_train), model.predict_proba(X_test)
print("Train AUC: ", roc_auc_score(y_true=Y_train, y_score=Pr_train, multi_class='ovo'))
print("Test  AUC: ", roc_auc_score(y_true=Y_test, y_score=Pr_test, multi_class='ovo'))

In [None]:
# Hard class predictions for both splits; these feed the confusion matrices.
P_train, P_test = model.predict(X_train), model.predict(X_test)

In [None]:
# Confusion matrix on the training set, normalised over the true labels:
# each row sums to 1, so the diagonal reads as per-class recall.
cm = confusion_matrix(y_true=Y_train, y_pred=P_train, normalize='true')
cm

In [None]:
def plot_cm(cm, classes=None):
    """Draw a confusion matrix as an annotated heatmap on its own figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true classes on the rows and predicted
        classes on the columns.
    classes : sequence of str, optional
        Tick labels in class-code order. Defaults to
        ['negative', 'positive', 'neutral'], which matches the codes
        0, 1, 2 assigned by ``target_map``.
    """
    if classes is None:
        classes = ['negative', 'positive', 'neutral']
    df_cm = pd.DataFrame(cm, index=classes, columns=classes)
    # Use a dedicated figure/axes per call: drawing on the implicit "current"
    # axes would stack heatmaps and colorbars when called twice in one cell.
    _, ax = plt.subplots()
    sn.heatmap(df_cm, annot=True, fmt='g', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel("Target")

plot_cm(cm)

In [None]:
# Same true-label-normalised confusion matrix, computed on the held-out test set.
cm_test = confusion_matrix(y_true=Y_test, y_pred=P_test, normalize='true')
plot_cm(cm_test)

# 保存模型，以备使用。

In [None]:
# Persist both fitted objects: the vectorizer turns raw text into features,
# and the model scores those features.
# The context managers guarantee each file is flushed and closed, even if
# pickling raises part-way through.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artifacts from a trusted location.
with open('./sentiment-analysis-lr-vocabulary.pkl', "wb") as vocab_file:
    pickle.dump(vectorizer, vocab_file)
with open('./sentiment-analysis-lr-model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
