In [None]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# NOTE(review): this aliases the top-level matplotlib package, not
# matplotlib.pyplot, so calls such as plt.plot(...) would not resolve through
# this name — confirm whether `import matplotlib.pyplot as plt` was intended.
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('stopwords')

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Смотрим на данные

In [None]:
# Extract the competition archive from the mounted Google Drive into the
# current working directory.
!unzip '/content/drive/MyDrive/games.zip'

Archive:  /content/drive/MyDrive/games.zip
  inflating: games/reviews_test.csv  
  inflating: games/reviews_train.csv  
  inflating: games/sample_submission.csv  


In [None]:
# Read the training reviews, then discard the 'id' column.
data = pd.read_csv("/content/games/reviews_train.csv")
data = data.drop(columns=['id'])

In [None]:
# Read the same training file a second time into `df` (again without 'id').
df = pd.read_csv("/content/games/reviews_train.csv")
df = df.drop(columns=['id'])

In [None]:
# Preview the first rows. `head` has to be called: the bare attribute only
# displays the bound-method repr, which dumps the whole frame as text.
data.head()

<bound method NDFrame.head of        Unnamed: 0                                             review
0          638820  **DISCLAIMER** I received a review key from th...
1          644537                                Early Access Review
2          604237  First of all, I love dual stick shooters, and ...
3          670182  Love the game so far. Thanks to Steam for givi...
4          681107                                Early Access Review
...           ...                                                ...
44995      576936  Everyone else says it. Broken mechanics, same ...
44996      621717  Whoever gives this game a negative review beca...
44997      637230       My most played game on steam at moment!  5/5
44998      689339  My review of infinite well it is a VERY VERY g...
44999      585625  Dull, Limited Game PLay. even £3.99 was to muc...

[45000 rows x 2 columns]>

In [None]:
# Column dtypes and non-null counts of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  45000 non-null  int64 
 1   review      44961 non-null  object
dtypes: int64(1), object(1)
memory usage: 703.2+ KB


# Чистим текст

In [None]:
def _clean_text_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    cached = getattr(_clean_text_resources, "_cached", None)
    if cached is None:
        cached = (frozenset(stopwords.words("english")), SnowballStemmer('english'))
        _clean_text_resources._cached = cached
    return cached


def clean_text(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: lower-case, drop English stop words and tokens shorter than three
    characters, expand common contractions, pad or remove punctuation with
    the regex rules below, then stem each remaining token.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            NaN from a missing review) is treated as an empty review.

    Returns:
        The cleaned text as a single string; ``""`` for non-string input.
    """
    # Missing reviews arrive as float NaN; treat them as empty instead of crashing.
    if not isinstance(text, str):
        return ""

    stops, stemmer = _clean_text_resources()

    # The former `text.translate(string.punctuation)` call was dropped on purpose.
    # str.translate indexes its table by code point, so the 32-character
    # punctuation string never removed punctuation; it only remapped control
    # characters 0-31 (newline -> '+', tab -> '*', carriage return -> '.'),
    # which glued neighbouring words together. Punctuation is handled by the
    # regex rules below.

    ## Convert words to lower case and split them
    words = text.lower().split()

    ## Remove stop words and tokens shorter than three characters
    text = " ".join(w for w in words if w not in stops and len(w) >= 3)

    # Clean the text
    # NOTE(review): inside the character class `+-=` is a range, so ':', ';'
    # and '<' also survive this filter — confirm that is intended.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): `\0` is an octal escape for NUL, so this matches NUL + "s"
    # and cannot fire after the filter above; presumably "0s" was meant.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Stem every remaining token and glue the result back together.
    return " ".join(stemmer.stem(word) for word in text.split())

Заполняем `NaN` пустыми строками

In [None]:
# Replace missing reviews with empty strings so every entry is a str.
# Only the 'review' column is filled and assigned back; the previous form
# assigned the whole DataFrame returned by data.fillna(...) to that column.
data['review'] = data['review'].fillna('')

In [None]:
# Binary target: 1 when the integer value of 'like' is positive, otherwise 0.
labels = data['like'].apply(lambda like: int(int(like) > 0))

In [None]:
# Run every training review through clean_text.
data['review'] = data['review'].map(clean_text)

In [None]:
# Show the first ten rows after cleaning.
data.head(10)

Unnamed: 0,review,like
0,kind game gift friend annoy them glitch part g...,-1
1,earli access review,1
2,favourit game ever must play,1
3,tate decay put urviv zombi surviv genr peopl g...,1
4,realli good game must buy dlc realli enjoy it,1
5,earli access review,1
6,earli access review,1
7,earli access review,-1
8,10 10 best game ever no rage funni goal save e...,1
9,do not you bethesda id god thank you love you,1


In [None]:
# Size cap handed to the Tokenizer through num_words.
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
# Fit the tokenizer on the cleaned training reviews.
tokenizer.fit_on_texts(data['review'])

# Turn each review into a list of integer ids, then into fixed-length rows.
sequences = tokenizer.texts_to_sequences(data['review'])
# NOTE(review): from here on `data` is the padded array with maxlen=50, no longer
# the DataFrame, so re-running earlier cells that index data['review'] will fail.
data = pad_sequences(sequences, maxlen=50)

In [None]:
# Shape of `data`, which at this point holds the result of pad_sequences.
print(data.shape)

(563131, 50)


## Embeddings

In [None]:
# Load pre-trained GloVe vectors into a {word: float32 vector} lookup.
embeddings_index = dict()
# The context manager closes the file even when a line fails to parse. The
# explicit encoding keeps the read independent of the platform default
# (assumes the GloVe file is UTF-8 text — TODO confirm for other embeddings).
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Weight matrix with one 100-dimensional row per token id; ids that have no
# pre-trained vector keep their all-zero row.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    # Stop at the first id that falls outside the matrix.
    if index >= vocabulary_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

## LSTM

In [None]:
# Embedding -> LSTM -> single sigmoid unit, assembled from a layer list.
# NOTE(review): the Embedding layer is created without a `weights` argument,
# so the `embedding_matrix` built earlier is not used by this model — confirm.
model_lstm = Sequential([
    Embedding(20000, 100, input_length=50),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Binary target taken from the untouched `df` copy: 1 for a positive 'like', else 0.
labels = df['like'].apply(lambda like: int(int(like) > 0))

In [None]:
# Train for 6 epochs with 40% of the samples held out for validation.
model_lstm.fit(
    x=data,
    y=np.array(labels),
    epochs=6,
    validation_split=0.4,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fb0da17ba90>

# Получаем предсказание модели

In [None]:
# Read the test reviews, then discard the 'Unnamed: 0' column.
test_data = pd.read_csv("/content/games/reviews_test.csv")
test_data = test_data.drop(columns=['Unnamed: 0'])

In [None]:
# Replace missing test reviews with empty strings so every entry is a str.
# Only the 'review' column is filled and assigned back; the previous form
# assigned the whole DataFrame returned by test_data.fillna(...) to that column.
test_data['review'] = test_data['review'].fillna('')

In [None]:
# Run every test review through clean_text.
test_data['review'] = test_data['review'].map(clean_text)

In [None]:
# Encode the test reviews with the tokenizer that was fitted on the training
# reviews. Fitting a fresh Tokenizer on the test set assigns ids from the test
# vocabulary, so the same word would get a different id than the one the
# model was trained on and the predictions would be meaningless.
sequences = tokenizer.texts_to_sequences(test_data['review'])
# Same fixed length (50) as the training sequences.
test_data = pad_sequences(sequences, maxlen=50)

In [None]:
# Run the model on the padded test sequences.
y_test = model_lstm.predict(test_data)

In [None]:
import csv

# Dump the raw model outputs: a header row "0", then one row of y_test per line.
with open('answers.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['0'])
    writer.writerows([prediction] for prediction in y_test)

In [None]:
!cp "/content/answers_games.csv" "/content/drive/My Drive/"
/content/sample_data/README.md

cp: cannot stat '/content/answers_games.csv': No such file or directory


# Обработка данных

In [None]:
# Read the saved predictions back; `data` is rebound to this new frame.
data = pd.read_csv('/content/answers.csv')

In [None]:
# Drop the first and last character of every entry (presumably the '[' and ']'
# left by writing each prediction as a one-element array — verify against the file).
data['0'] = data['0'].str[1:-1]

In [None]:
# Convert the remaining text to numeric values.
data['0'] = pd.to_numeric(data['0'])

In [None]:
# Turn each score into a class label: 1 at 0.5 or above, otherwise -1.
data['0'] = data['0'].apply(lambda score: 1 if float(score) >= 0.5 else -1)

In [None]:
# Show the first ten thresholded labels.
data.head(10)

Unnamed: 0,0
0,1
1,1
2,-1
3,-1
4,1
5,1
6,1
7,1
8,1
9,-1


In [None]:
# Check the row count and dtype of the label column before saving.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       45000 non-null  int64
dtypes: int64(1)
memory usage: 351.7 KB


In [None]:
# Save the labels as a single column named 'label'. Column aliases have to be
# passed as a list: a plain string is only treated as a truthy flag by to_csv,
# which then writes the existing column name ('0') as the header.
data.to_csv('ans.csv', index=False, header=['label'])