In [1]:
import pandas as pd
import numpy as np
import operator
from sklearn.model_selection import train_test_split


# Seed NumPy's legacy global RNG; split_train_test_set draws from it via np.random.rand.
np.random.seed(0)
BASE_DIR = ''  # not referenced in the visible cells

# Number of newline-separated sentences kept per review; passed to truncate_or_pad as n.
MAX_SENTENCE_PER_SESSION = 1

# Passed as test_size to train_test_split, i.e. the held-out fraction.
VALIDATION_SPLIT = 0.1
PRELOAD = False  # not referenced in the visible cells

REGRESSION = False  # not referenced in the visible cells

# Placeholders overwritten as a side effect of sentence_embedding_func /
# session_embedding_func with the length of their hand-crafted feature vectors.
SENTENCE_EMBEDDING_SIZE = None
SESSION_EMBEDDING_SIZE = None

from liwc_tagger import tag, get_features

# LIWC resources consumed by tag(...) inside sentence_embedding_func.
# NOTE(review): exact structure of `features` / `liwc` is defined by liwc_tagger — confirm there.
features, liwc = get_features('liwc_feature.json')


def split_train_test_set(df, train_frac=0.8):
    """Randomly partition ``df`` into a train part and a test part.

    Each row is assigned independently: a uniform draw from NumPy's global RNG
    (``np.random.rand``) below ``train_frac`` sends the row to the train part,
    otherwise to the test part. The resulting sizes are therefore only
    approximately ``train_frac`` / ``1 - train_frac`` of ``len(df)``.

    Parameters
    ----------
    df : pandas.DataFrame (or any object supporting boolean-mask indexing)
        Data to split.
    train_frac : float, default 0.8
        Probability that a row lands in the train part; must lie in [0, 1].

    Returns
    -------
    (train, test) : tuple
        Two disjoint subsets of ``df`` that together contain every row.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError("train_frac must be between 0 and 1, got %r" % (train_frac,))
    # One uniform draw per row; reproducible when np.random.seed was set beforehand.
    in_train = np.random.rand(len(df)) < train_frac
    return df[in_train], df[~in_train]


langdetect_count = 0  # not referenced in the visible cells

# Load the reviews; the code below relies on a `text` and a `rating` column.
df_reviews = pd.read_csv('oneperline.csv')  # , encoding='utf-8')
df_reviews = df_reviews.assign(
    len=df_reviews.text.str.len(),          # character length of each review
    rating=df_reviews['rating'].round(),    # snap fractional ratings to whole numbers
)

# Keep reviews of 10..4000 characters and drop those whose rounded rating is 3.
# .copy() makes the filtered frame independent, so the assignment below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
df_reviews = df_reviews[
    df_reviews['len'].between(10, 4000) & (df_reviews.rating != 3)
].copy()

# Binarize the label: 1 when rating > 3, otherwise 0.
df_reviews['rating'] = np.where(df_reviews.rating > 3, 1, 0)

# NOTE(review): despite the name, no class balancing is performed here;
# this is just an alias of the filtered frame.
df_rev_balanced = df_reviews






def truncate_or_pad(sentence, n):
    """Split ``sentence`` on newlines and force the result to ``n`` entries.

    Every piece is whitespace-stripped. When there are at least ``n`` pieces
    the first ``n`` are kept; otherwise the list is left-padded with empty
    strings so the original pieces end up at the tail.
    """
    pieces = [chunk.strip() for chunk in sentence.split("\n")]
    shortfall = n - len(pieces)
    if shortfall > 0:
        return [""] * shortfall + pieces
    return pieces[:n]


# One entry per review: the list of MAX_SENTENCE_PER_SESSION newline-separated,
# stripped sentences produced by truncate_or_pad (left-padded with "" if short).
pad_sentence = df_rev_balanced.text.map(lambda x: truncate_or_pad(x, MAX_SENTENCE_PER_SESSION)).values

# Running count of sentence_embedding_func calls; drives its progress printout.
counter = 0
def sentence_embedding_func(last_str, str):
    """Build the hand-crafted feature vector for one sentence.

    The returned list is the concatenation, in this order, of:

    * speaker flag (2): ``[0, 1]`` for an "AAAAA" prefix, ``[1, 0]`` for
      "UUUUU", ``[0, 0]`` otherwise;
    * character length capped at 400 (1);
    * space-separated token count capped at 100 (1);
    * keyword sentiment flags ``[positive, negative]`` (2);
    * unique-token rate ``[valid, unique tokens / token count]`` (2);
    * question flag, set when "wh" or "how" occurs as a substring (1);
    * vocabulary overlap with the previous sentence
      ``[valid, |this|, |last|, |union|, |intersection|, |union| / |intersection|]`` (6);
    * whatever ``tag(str, features, liwc)`` returns (LIWC features).

    The overlap and unique-rate groups are all zeros when either sentence is
    empty. The ratio entry is 0 when the two sentences share no token.

    Parameters
    ----------
    last_str : str
        The preceding sentence ("" when there is none). It is used as passed;
        any speaker prefix on it is not removed.
    str : str
        The sentence to featurize. NOTE: the name shadows the builtin and is
        kept only so existing callers keep working.

    Returns
    -------
    list
        Hand-crafted features followed by the LIWC features.

    Side effects
    ------------
    Sets the global ``SENTENCE_EMBEDDING_SIZE`` to the number of hand-crafted
    features (LIWC part excluded), increments the global ``counter`` and
    prints it every 1000 calls.
    """
    global counter
    global SENTENCE_EMBEDDING_SIZE
    # Token count of the raw sentence, taken before any speaker prefix is removed.
    words = len(str.split(" "))

    if str.startswith("AAAAA"):
        embed_speaker = [0, 1]
    elif str.startswith("UUUUU"):
        embed_speaker = [1, 0]
    else:
        embed_speaker = [0, 0]

    if str.startswith(("AAAAA", "UUUUU")):
        # Drop the first 8 characters of a speaker-tagged sentence.
        # NOTE(review): presumably the 5-letter tag plus a 3-character separator — confirm against the data.
        str = str[8:]

    embed_sentence_length = [min(len(str), 400)]
    embed_word_length = [min(len(str.strip().split(" ")), 100)]

    # Substring (not whole-word) keyword matching.
    pos = any(p in str for p in ("love", "friend"))
    neg = any(n in str for n in ("stupid", "idiot", "fuck"))
    embed_sentiment = [1 if pos else 0, 1 if neg else 0]

    if len(last_str) == 0 or len(str) == 0:
        embed_overlap = [0, 0, 0, 0, 0, 0]
        embed_uniq_rate = [0, 0]
    else:
        this_vocab = set(str.strip("\n").split(" "))
        # Fix: the previous sentence's vocabulary must come from last_str
        # (it used to be built from str, making every overlap feature trivial).
        last_vocab = set(last_str.strip("\n").split(" "))
        union_size = len(this_vocab | last_vocab)
        shared_size = len(this_vocab & last_vocab)
        embed_overlap = [1,
                         len(this_vocab),
                         len(last_vocab),
                         union_size,
                         shared_size,
                         # Disjoint vocabularies would otherwise divide by zero.
                         union_size / shared_size if shared_size else 0,
                         ]

        embed_uniq_rate = [1, len(this_vocab) / words]

    if "wh" in str or "how" in str:
        embed_question = [1]
    else:
        embed_question = [0]

    ret = embed_speaker + embed_sentence_length + embed_word_length + embed_sentiment + embed_uniq_rate + embed_question + embed_overlap
    SENTENCE_EMBEDDING_SIZE = len(ret)

    liwc_tagged = tag(str, features, liwc)
    counter += 1
    if counter % 1000 == 0:
        print(counter)
    return ret + liwc_tagged


def session_embedding_func(str):
    """Build the hand-crafted feature vector for a whole review text.

    Layout of the returned list (10 entries):

    * character count capped at 100, 1000 and 5000 (3);
    * token count capped at 100, 1000 and 5000, where tokens are obtained by
      replacing newlines with spaces and splitting on single spaces (3);
    * number of newline-separated turns (1);
    * keyword sentiment flags ``[positive, negative]`` using substring matching (2);
    * unique tokens divided by token count (1).

    Side effect: stores the vector length in the global ``SESSION_EMBEDDING_SIZE``.
    """
    global SESSION_EMBEDDING_SIZE

    caps = (100, 1000, 5000)
    tokens = str.replace("\n", " ").split(" ")
    token_count = len(tokens)
    char_count = len(str)
    turn_count = len(str.split("\n"))

    length_part = [min(char_count, cap) for cap in caps]
    words_part = [min(token_count, cap) for cap in caps]

    has_positive = any(keyword in str for keyword in ("love", "friend"))
    has_negative = any(keyword in str for keyword in ("stupid", "idiot", "fuck"))
    sentiment_part = [int(has_positive), int(has_negative)]

    uniq_part = [len(set(tokens)) / token_count]

    vector = length_part + words_part + [turn_count] + sentiment_part + uniq_part
    SESSION_EMBEDDING_SIZE = len(vector)
    return vector


# Per-sentence features: for every review, featurize each sentence together
# with the sentence that precedes it ("" for the first one).
X_sentence_aux_embedding = np.array(
    [[sentence_embedding_func(last_sentence, sentence)
      for last_sentence, sentence in zip([""] + session[:-1], session)]
     for session in pad_sentence],
    dtype="float32")

# Scale every feature column by its maximum. Columns whose maximum is 0 are
# divided by 1 instead, so they stay 0 rather than becoming NaN (0 / 0).
_col_max = np.max(X_sentence_aux_embedding, axis=(0, 1))
X_sentence_aux_embedding /= np.where(_col_max == 0, 1, _col_max)

# Per-review (session-level) features, scaled the same way.
X_session_aux_embedding = np.array(
    [session_embedding_func(session) for session in df_rev_balanced.text.values],
    dtype="float32")
_col_max = np.max(X_session_aux_embedding, axis=0)
X_session_aux_embedding /= np.where(_col_max == 0, 1, _col_max)

# Flatten the (review, sentence, feature) array to one row per review.
SIZE = X_sentence_aux_embedding.shape[0]
X_sentence_aux_embedding = X_sentence_aux_embedding.reshape(SIZE, -1)



1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000




In [2]:
# Replace NaN with 0 (the nan_to_num default). The second positional
# parameter of np.nan_to_num is `copy`, not a fill value, so the former
# `, 0` argument silently meant copy=False; it is dropped here.
X_sentence_aux_embedding = np.nan_to_num(X_sentence_aux_embedding)

In [5]:
import pickle

# Final design matrix: sentence-level features followed by session-level features.
X = np.concatenate([X_sentence_aux_embedding, X_session_aux_embedding], axis=1)
# X = X_session_aux_embedding
Y = df_rev_balanced.rating.values.astype(int)

# Two consecutive pickles are written to the same file, so a reader must call
# pickle.load twice (X first, then Y). The context manager guarantees the
# handle is closed even if a dump fails.
with open("pickled.pk", "wb") as pf:
    pickle.dump(X, pf)
    pickle.dump(Y, pf)

In [7]:

from sklearn import datasets, linear_model, svm
from sklearn.metrics import cohen_kappa_score

# Hold out VALIDATION_SPLIT of the samples; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=VALIDATION_SPLIT, random_state=9)

# Logistic regression with C=1e5; report Cohen's kappa on the held-out part.
logreg = linear_model.LogisticRegression(C=1e5).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(cohen_kappa_score(y_pred, y_test))

0.160956422731


(12157, 98)

In [12]:
from sklearn.metrics import cohen_kappa_score

# Fit the SVM on the first 10000 training rows only.
X_train_small, y_train_small = X_train[:10000], y_train[:10000]

# NOTE: `logreg` is rebound to an SVC here, replacing the logistic-regression model.
logreg = svm.SVC(C=1e5).fit(X_train_small, y_train_small)
y_pred = logreg.predict(X_test)
print(cohen_kappa_score(y_pred, y_test))

0.121584236172


In [13]:
from sklearn.metrics import accuracy_score
# Arguments follow the (y_true, y_pred) signature, matching the other cells;
# accuracy is symmetric in its two arguments, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.62250185048112505

In [28]:
# NOTE(review): the output recorded under this cell is a two-column float
# array, whereas the visible split cell derives y_train from the 1-D integer
# vector Y — this output appears to come from an earlier kernel state.
y_train

array([[ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.],
       ..., 
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.]])

In [29]:
import sklearn

In [23]:
# NOTE(review): X_train_seten is not defined in any visible cell; this relies
# on hidden kernel state and would raise NameError on a fresh run.
X_train_seten.shape

(12157, 480)

In [24]:
# NOTE(review): X_train_sessi is not defined in any visible cell; this relies
# on hidden kernel state and would raise NameError on a fresh run.
X_train_sessi.shape

(12157, 10)

In [35]:
# NOTE(review): lowercase `y` is not defined in any visible cell (the label
# vector here is `Y`); the recorded (150,) shape does not match the 12157-row
# shapes shown by the neighbouring cells.
y.shape

(150,)

In [34]:
# argmax over axis=1 needs a 2-D y_train.
# NOTE(review): the visible split cell produces a 1-D y_train, for which
# axis=1 is out of bounds — this cell depends on an earlier kernel state.
y_train.argmax(axis=1).shape

(12157,)

In [36]:
# Rebuild the 1-D integer label vector from the binarized rating column
# (same expression as in the pickling cell).
Y = df_rev_balanced.rating.values.astype(int)


In [37]:
# Display the label vector (NumPy abbreviates long arrays in the repr).
Y

array([0, 0, 0, ..., 0, 1, 0])

0.156173868807


In [59]:
# Import both metrics here so the cell does not depend on accuracy_score
# having been imported by a different cell.
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows are true labels and columns are predicted labels (y_true passed first).
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[800  91]
 [351 109]]
0.672834937084


In [40]:
# Display the training labels (NumPy abbreviates long arrays in the repr).
y_train

array([0, 1, 0, ..., 1, 0, 1])

In [42]:
from sklearn import linear_model, datasets

# Creates a new, unfitted LogisticRegression with C=1e5 and rebinds `logreg`.
# NOTE(review): the estimator repr recorded under this cell is not produced by
# the code shown (an assignment displays nothing) — presumably a trailing
# expression such as a fit call was removed; confirm before relying on it.
logreg = linear_model.LogisticRegression(C=1e5)


LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [44]:
from sklearn.metrics import accuracy_score

In [51]:
# Fraction of held-out samples whose prediction matches the true label.
# Uses whichever y_pred was assigned most recently in the kernel.
accuracy_score(y_test, y_pred)

0.67357512953367871

In [52]:
from sklearn.metrics import confusion_matrix

In [53]:
# NOTE(review): the arguments are in (y_pred, y_test) order, while the
# signature is confusion_matrix(y_true, y_pred). In the matrix displayed
# below, rows are therefore PREDICTED labels and columns are TRUE labels —
# the transpose of the usual layout and of the cell that calls
# confusion_matrix(y_test, y_pred).
confusion_matrix(y_pred, y_test)

array([[835, 385],
       [ 56,  75]])

0.12114980949107645