In [1]:
import pandas as pd
import os
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.util import read_line_block
from nltk.corpus import stopwords
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kasperipalkama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kasperipalkama/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kasperipalkama/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load data
filename = 'subreddit_travel_may2015.csv'
wd = os.getcwd()
data_dir = os.path.join(wd, 'data')
raw_data = pd.read_csv(os.path.join(data_dir, filename), header=None, skiprows=1)

In [3]:
# preprocess
filename_pp = 'preprocessed.csv'
data = raw_data[raw_data.iloc[:, 0] != '[deleted]']
data.to_csv(os.path.join(data_dir, filename_pp), header=False, index=False)

In [4]:
# create corpus
newcorpus = PlaintextCorpusReader(data_dir, fileids=filename_pp, para_block_reader=read_line_block)

In [5]:
# preprocessing corpus -- running time intentionally not optimized to make experimenting more convenient
def stemming(word_list, stemmer=nltk.PorterStemmer()):
    return [stemmer.stem(w) for w in word_list]

def lemmatizing(word_list, lemmatizer=nltk.WordNetLemmatizer()):
    return [lemmatizer.lemmatize(w) for w in word_list]

def to_lower(word_list):
    return [w.lower() for w in word_list]

def remove_punctuation(word_list):
    return [w for w in word_list if w.isalpha()]

def remove_stopwords(word_list):
    return [w for w in word_list if not w in stopwords.words('english')]


def clean_sentences(word_list):
    return remove_stopwords(
        remove_punctuation(
        to_lower(
        lemmatizing(word_list))))

sentences = list(map(lambda x: clean_sentences(x), newcorpus.sents()))

In [6]:
n_sentences = 10
for raw_s, s in zip(newcorpus.sents()[0:n_sentences], sentences[0:n_sentences]):
    print(raw_s)
    print(s)

['I', 'was', 'there', 'in', 'February', 'except', 'there', 'was', 'a', 'LOT', 'more', 'ice', '!']
['wa', 'february', 'except', 'wa', 'lot', 'ice']
['"', 'I', 'get', 'what', 'you', 'said', ',', 'I', "'", 'm', 'just', 'letting', 'you', 'know', 'you', 'are', 'incorrect', '."']
['get', 'said', 'letting', 'know', 'incorrect']
['"', 'Be', 'very', 'careful', 'of', 'being', 'hit', 'with', 'an', 'airport', 'exit', 'tax', '.']
['careful', 'hit', 'airport', 'exit', 'tax']
['I', 'don', "'", 't', 'know', 'what', 'the', 'situation', 'is', 'like', 'today', ',', 'but', 'as', 'recently', 'as', 'one', 'year', 'ago', 'it', 'was', 'about', '$', '56', '."']
['know', 'situation', 'like', 'today', 'recently', 'one', 'year', 'ago', 'wa']
['"', 'I', 'don', "'", 't', 'know', 'how', 'into', 'boarding', 'you', 'are', ',', 'but', 'I', 'have', 'some', 'shock', 'pads', 'and', 'soft', 'wheels', 'to', 'put', 'on', 'my', 'board', 'to', 'smoothen', 'out', 'the', 'trip', '.']
['know', 'boarding', 'shock', 'pad', 'soft', 

In [7]:
# Word2Vec
model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)

In [8]:
model.train(sentences=sentences, total_examples=model.corpus_count, epochs=1)

(636280, 676150)

In [10]:
# Predict
model.wv.similar_by_word('thailand')

[('cambodia', 0.9737130403518677),
 ('vietnam', 0.9609863758087158),
 ('spain', 0.9257575869560242),
 ('india', 0.9173445105552673),
 ('japan', 0.9119035005569458),
 ('italy', 0.911872148513794),
 ('germany', 0.9074152708053589),
 ('korea', 0.9055827260017395),
 ('southern', 0.8994117975234985),
 ('greece', 0.8949986696243286)]