**PREDICTING STOCK MARKET TRENDS THROUGH SOCIAL MEDIA**

ADITYA SHIRODKAR

A20332644

All the data used for this project is available at http://cloud.aditya11.com

Username: twitter

Password: twitter

In [None]:
from collections import Counter
import glob
import hashlib
import io
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from urllib import urlretrieve
%matplotlib inline

In [None]:
def get_files(path):
    """Return the .txt files directly inside a directory, sorted by name.

    glob.glob() yields matches in arbitrary, filesystem-dependent order, so
    the result is sorted explicitly to give a stable ordering.
    NOTE(review): other code in this file groups consecutive files and loads
    labels from a separate file, so it appears to rely on this order --
    confirm.
    Params:
        path....a directory containing .txt files.
    Returns:
        a list of .txt file paths (each prefixed with `path`), sorted
        alphabetically. Empty if nothing matches.
    """
    return sorted(glob.glob(path + os.sep + "*.txt"))

In [None]:
# Paths of all training tweet files (.txt) found in 'train/tweets'.
all_train_files = get_files('train/tweets')

In [None]:
def file2tweets(filename):
    """Read a UTF-8 text file and return its lines.

    The file is opened in a `with` block so the handle is closed as soon as
    the lines have been read, instead of being left for garbage collection.
    Params:
        filename....path of the text file to read
    Returns:
        a list of unicode strings, one per line, each keeping its trailing
        newline (as returned by readlines()).
    """
    with io.open(filename, encoding='utf8') as handle:
        return handle.readlines()

In [None]:
# Number of tweets (lines) in each training file.
temp = [len(file2tweets(path)) for path in all_train_files]

# Total tweets per consecutive group of 8 files; a trailing group with
# fewer than 8 files is never appended to OY.
# NOTE(review): presumably 8 files make up one day of data -- confirm.
OY = []
total = 0
for position, count in enumerate(temp, 1):
    total += count
    if position % 8 == 0:
        OY.append(total)
        total = 0

In [None]:
import numpy as np
from matplotlib import pyplot as plt

# Bar chart of the tweet totals in OY, one bar per entry, labelled with the
# date strings in OX.
fig = plt.figure()

width = .35
OX = [
    "02NOV", "03NOV", "04NOV", "05NOV", "06NOV", "09NOV",
    "10NOV", "12NOV", "16NOV", "17NOV", "18NOV", "19NOV",
    "20NOV", "23NOV", "24NOV", "26NOV", "27NOV",
]
ind = np.arange(len(OY))
# Draw the bars with the declared width and centre them explicitly on their
# integer positions, then put the ticks at those same positions. Previously
# `width` was not passed to bar() while the ticks were shifted by width / 2,
# so the labels did not line up with the middle of the bars.
plt.bar(ind, OY, width, align='center')
plt.xticks(ind, OX)
plt.xlabel('DATE')
plt.ylabel('NO. OF TWEETS')

# Slant the x tick labels so neighbouring dates do not overlap.
fig.autofmt_xdate()

In [None]:
from pylab import *
from matplotlib.finance import candlestick, quotes_historical_yahoo

# Date range of the quotes to plot, as (Year, Month, Day).
# Plain decimal literals are used: a leading zero (e.g. 02) is an octal
# literal in Python 2 and a SyntaxError in Python 3.
FROM = (2015, 11, 2)
TO = (2015, 11, 27)

# (DATE, OPEN, CLOSE, HIGH, LOW, VOLUME)
quotes = quotes_historical_yahoo('DJIA', FROM, TO)

# Nothing to draw if no quotes came back for the range.
if len(quotes) == 0:
    raise SystemExit

fig = figure()
# Leave extra room at the bottom for the rotated date labels.
fig.subplots_adjust(bottom=0.2)
ax = fig.add_subplot(111)

candlestick(ax, quotes, width=0.8)

# Interpret the x values as dates and rescale the view to the plotted data.
ax.xaxis_date()
ax.autoscale_view()
setp(gca().get_xticklabels(), rotation=45, horizontalalignment='right')

show()

In [None]:
def tokenize(text):
    """Split a string into lowercase tokens, dropping punctuation.

    A token is a maximal run of the ASCII characters [A-Za-z0-9_]; the
    underscore (_) is therefore not treated as punctuation. Every other
    character, including non-ASCII letters and digits, acts as a separator.
    The character class is spelled out rather than using \\w because \\w also
    matches non-ASCII word characters on Python 3 strings.
    Params:
        text....a string
    Returns:
        a list of lowercase tokens, in order of appearance (empty if the
        string contains no token characters)
    """
    # Match first, lowercase afterwards, so that case-folding can never turn
    # a non-ASCII character into an ASCII one that would join a token.
    return [token.lower() for token in re.findall(r"[A-Za-z0-9_]+", text)]

In [None]:
def do_vectorize(filenames, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True, ngram_range=(1,1)):
    """
    Build a document-term matrix from a list of files with sklearn's
    CountVectorizer (configured with input='filename', so the vectorizer
    is given file names rather than text). Each row is one file and each
    column one vocabulary term.
    Params:
        filenames.......list of file names to vectorize
        tokenizer_fn....function that splits a document into tokens
        min_df..........passed to CountVectorizer: drop terms that appear in
                        fewer than this many documents
        max_df..........passed to CountVectorizer: drop terms that appear in
                        more than this fraction of documents
        binary..........if true, entries are 1 when the term occurs in the
                        document and 0 otherwise; if false, entries are
                        term counts
        ngram_range.....tuple (n, m): use phrases of n to m tokens inclusive,
                        e.g. (1,2) means unigrams and bigrams
    Return:
        A tuple (X, vec), where X is the matrix returned by fit_transform
        and vec is the fitted CountVectorizer object.
    """
    vectorizer = CountVectorizer(
        input='filename',
        tokenizer=tokenizer_fn,
        min_df=min_df,
        max_df=max_df,
        binary=binary,
        ngram_range=ngram_range,
        dtype=int,
    )
    features = vectorizer.fit_transform(filenames)
    return features, vectorizer
    
# Vectorize every training file using term counts (binary=False) and report
# the size of the resulting matrix.
matrix, vec = do_vectorize(all_train_files, binary=False)
print('matrix represents %d documents with %d features' % matrix.shape)

In [None]:
# Factory for the classifier used by do_cross_validation below. It builds a
# support vector classifier (sklearn.svm.SVC) with probability estimates
# switched on.
def get_clf():
    """Return a new, unfitted SVC created with probability=True."""
    from sklearn.svm import SVC
    return SVC(probability=True)

In [None]:
# One label per line of train/train_y.txt, with surrounding whitespace
# stripped. The file is read inside a `with` block so the handle is closed
# as soon as the labels have been collected.
with open("train/train_y.txt", 'r') as label_file:
    labels = np.array([line.strip() for line in label_file])

In [None]:
def repeatable_random(seed):
    """Yield an endless, reproducible stream of integers in the range 0-255.

    The stream is the bytes of md5(str(seed)), followed by the bytes of the
    md5 of that digest, and so on, so the same seed always produces the same
    sequence. Works on both Python 2 and Python 3 and produces the same
    values on each.
    Params:
        seed....any value; its str() form seeds the hash chain
    Yields:
        one int (0-255) per digest byte, indefinitely
    """
    digest = str(seed)
    # hashlib needs bytes: on Python 3 str() gives text that must be encoded,
    # while on Python 2 it is already a byte string and is used unchanged.
    if not isinstance(digest, bytes):
        digest = digest.encode('utf-8')
    while True:
        digest = hashlib.md5(digest).digest()
        # bytearray iterates as ints on both Python 2 and Python 3.
        for byte in bytearray(digest):
            yield byte

def repeatable_shuffle(X, y, filenames):
    """Reorder X, y and filenames with one fixed pseudo-random permutation.

    Row i is given the i-th value from repeatable_random(42) as its sort
    key; the rows are then ordered by those keys, so every call yields the
    same permutation for a given number of rows.
    Params:
        X............indexable with a list of row indices; X.shape[0] is
                     the number of rows
        y............array of labels, indexable with a list of indices
        filenames....sequence of file names, one per row
    Returns:
        a tuple (X, y, filenames) in the shuffled order, with filenames
        converted to a numpy array.
    """
    n_rows = X.shape[0]
    stream = repeatable_random(42)
    # Draw the keys up front, in row order, exactly as sorted() would.
    sort_keys = [next(stream) for _ in range(n_rows)]
    order = sorted(range(n_rows), key=lambda row: sort_keys[row])
    shuffled_names = np.array(filenames)[order]
    return X[order], y[order], shuffled_names

# Shuffle the feature matrix, the labels and the file names with the same
# deterministic permutation, so rows, labels and names stay aligned.
X, y, filenames = repeatable_shuffle(matrix, labels, all_train_files)

In [None]:
def do_cross_validation(X, y, n_folds=5, verbose=False):
    """
    Run n-fold cross validation: for every fold produced by sklearn's KFold,
    train a fresh classifier from get_clf() on the training part and score
    it on the held-out part. The data is used in the order given (KFold is
    created without shuffling).
    Params:
        X.........feature matrix, one row per document
        y.........the true labels of each document
        n_folds...the number of folds of cross-validation to do
        verbose...If true, print the testing accuracy of each fold.
    Return:
        the mean testing accuracy over all folds.
    """
    folds = KFold(len(y), n_folds)
    fold_scores = []
    for fold_no, (train_rows, test_rows) in enumerate(folds):
        model = get_clf()
        model.fit(X[train_rows], y[train_rows])
        score = accuracy_score(y[test_rows], model.predict(X[test_rows]))
        fold_scores.append(score)
        if verbose:
            print('fold %d accuracy=%.4f' %(fold_no, score))
    return np.mean(fold_scores)
    
# Run 5-fold cross validation on the shuffled data, printing each fold's
# accuracy, then report the mean.
avg_accuracy = do_cross_validation(X, y, n_folds=5, verbose=True)
print('average cross validation accuracy=%.4f' % avg_accuracy)

In [None]:
from sklearn import metrics

# Fit one classifier on the full data set and report how well it reproduces
# the labels of that same data.
# NOTE(review): the model is scored on the rows it was trained on, so these
# figures are not an estimate of performance on unseen tweets.
clf = get_clf()
clf.fit(X, y)
m = clf.predict_proba(X)

# The columns of predict_proba follow the order of clf.classes_, so the label
# names are read from there instead of being hard-coded. With the labels
# 'BEARISH' and 'BULLISH' this gives the same result as before: the first
# class wins only when its probability is strictly larger, otherwise the
# second class is chosen. Exactly two classes are expected.
first_label, second_label = clf.classes_
predictedLabels = [first_label if p_first > p_second else second_label
                   for p_first, p_second in m]

print(metrics.classification_report(y, np.array(predictedLabels)))
print(metrics.confusion_matrix(y, np.array(predictedLabels)))