In [0]:
import pandas as pd
import numpy as np

import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [0]:
data = pd.read_csv("scmp_news.csv", encoding = "ISO-8859-1", index_col=0)

In [0]:
data.head()

Unnamed: 0,created_at,favorite_count,id_str,in_reply_to_screen_name,is_retweet,retweet_count,source,text
0,Wed Oct 15 10:49:07 +0000 2014,2,522338371721768961,,False,0,SocialFlow,"22,000 civil service jobs are up for grabs in ..."
1,Thu Sep 04 07:30:07 +0000 2014,3,507430393034080257,,False,5,SocialFlow,Japan names five women to new cabinet of Prime...
2,Tue Nov 05 06:31:09 +0000 2013,0,397612029126918144,,False,0,Hootsuite,RT @BloombergNews: Thai protesters rally in Ba...
3,Sat Jun 13 07:10:14 +0000 2015,1,609618764745805825,,False,3,Hootsuite,Hong Kong's Able Friend edged for favouritism ...
4,Thu May 08 23:20:12 +0000 2014,1,464545330835783680,,False,5,Hootsuite,Our map of all publicly accessible toilets in ...


In [0]:
data.text[0]

'22,000 civil service jobs are up for grabs in China next year http://t.co/vtf64hQ01Z'

In [0]:
def cleanURLs(text):
    return re.sub('http\S+','', text)

In [0]:
data["text"] = data["text"].apply(cleanURLs)

In [0]:
data.created_at[0]

'Wed Oct 15 10:49:07 +0000 2014'

In [0]:
datetime_format = "%a %b %d %H:%M:%S +0000 %Y"

In [0]:
data["created_at"] = pd.to_datetime(data.created_at, format=datetime_format, errors='ignore')

In [0]:
data = data[["created_at","text"]].sort_values(by="created_at").set_index("created_at")

In [0]:
data.head()

Unnamed: 0_level_0,text
created_at,Unnamed: 1_level_1
2009-03-13 06:56:10,@Tortue it seems @scmp was already taken ... a...
2009-03-13 12:20:03,The Modified Toy Orchestra will be playing in ...
2009-03-13 13:48:55,Our latest slideshow on Paris Fashion Week : ...
2009-03-13 14:38:37,Massive loans to the US worries Wen Jiabao:
2009-03-14 19:14:22,"Good Ba Ba, Sacred Kingdom meet in Group One r..."


In [0]:
train = data.groupby(pd.Grouper(freq='D'))['text'].apply(lambda x: "%s" % ' '.join(x))

In [0]:
train.head()

created_at
2009-03-13    @Tortue it seems @scmp was already taken ... a...
2009-03-14    Good Ba Ba, Sacred Kingdom meet in Group One r...
2009-03-15                                                     
2009-03-16    Sifu Ip Chun, son and successor of Ip Man, kee...
2009-03-17    @jeromyu well, the multimedia team here at the...
Freq: D, Name: text, dtype: object

In [0]:
train.index.rename("Date", inplace = True)

In [0]:
trainheadlines = train.values

In [0]:
basicvectorizer = CountVectorizer()
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)

(3332, 37986)


In [0]:
# example3 = [x for i in trainheadlines for x in CountVectorizer().build_tokenizer()(i)]

In [0]:
# pd.DataFrame([[x,example3.count(x)] for x in set(example3)])

18945 different words

In [0]:
stock = pd.read_csv("0016.HK.csv")

In [0]:
stock = stock[(stock["Close"] != "null")]

In [0]:
stock["rise_in_next_day"] = stock["Close"].astype("float").shift(-1)/stock["Close"].astype("float") >=1

In [0]:
stock["Date"] = pd.to_datetime(stock.Date, format="%Y-%m-%d", errors='ignore')

In [0]:
stock = stock.set_index("Date")

In [0]:
stock.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,rise_in_next_day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-03-03,57.75,58.799999,56.900002,58.0,43.546021,6141559,True
2009-03-04,57.5,59.599998,57.0,59.299999,44.522045,7374811,False
2009-03-05,59.549999,59.549999,57.950001,59.049999,44.334351,5885351,False
2009-03-06,58.0,58.5,56.5,56.5,42.41983,9289667,False
2009-03-09,57.349998,57.849998,55.799999,55.799999,41.894272,5106613,True


In [0]:
df = pd.merge(train.to_frame(), stock["rise_in_next_day"].to_frame(), left_index=True, right_index=True)

In [0]:
df.head()

Unnamed: 0_level_0,text,rise_in_next_day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-03-13,@Tortue it seems @scmp was already taken ... a...,True
2009-03-16,"Sifu Ip Chun, son and successor of Ip Man, kee...",False
2009-03-17,"@jeromyu well, the multimedia team here at the...",True
2009-03-18,US Newspapers may be on their deathbed: but n...,True
2009-03-19,Interview today with Josie Ho Chiu-yi (¦ó¶W»ö)...,False


In [0]:
df.shape

(2254, 2)

In [0]:
df.to_csv("0016_clean.csv")

In [0]:
train = df[df.index < '2017-01-01']
test = df[df.index > '2016-12-31']

# Basic model

In [0]:
trainheadlines = train["text"].values

basicvectorizer = CountVectorizer()
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)

(1929, 29490)


In [0]:
basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["rise_in_next_day"])

In [0]:
testheadlines = test["text"].values

basictest = basicvectorizer.transform(testheadlines)
predictions = basicmodel.predict(basictest)

In [0]:
pd.crosstab(test["rise_in_next_day"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,76,74
True,71,104


In [0]:
acc=accuracy_score(test['rise_in_next_day'], predictions)
acc

0.5538461538461539

In [0]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(10)

Unnamed: 0,Coefficient,Word
15568,0.735311,link
5540,0.707042,cities
17752,0.682269,needed
4347,0.648645,build
15274,0.566926,left
5123,0.562152,change
14725,0.558754,killed
26067,0.555549,technology
22849,0.551476,save
28259,0.550572,wan


In [0]:
coeffdf.tail(10)

Unnamed: 0,Coefficient,Word
6376,-0.544537,control
8312,-0.552323,dollar
13892,-0.558049,ipad
26323,-0.559127,think
26581,-0.58619,told
12547,-0.603809,hknews
10875,-0.612136,full
14055,-0.613675,jailed
12464,-0.616442,hits
20628,-0.715491,public


# Advanced model

In [0]:
trainheadlines = train["text"].values

advancedvectorizer = CountVectorizer(ngram_range=(2,2))
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)

In [0]:
print(advancedtrain.shape)

(1929, 308815)


In [0]:
advancedmodel = LogisticRegression()
advancedmodel = advancedmodel.fit(advancedtrain, train["rise_in_next_day"])

In [0]:
testheadlines = test["text"]

advancedtest = advancedvectorizer.transform(testheadlines)
advpredictions = advancedmodel.predict(advancedtest)

In [0]:
pd.crosstab(test["rise_in_next_day"], advpredictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,79,71
True,98,77


In [0]:
acc=accuracy_score(test['rise_in_next_day'], advpredictions)
acc

0.48

In [0]:
advwords = advancedvectorizer.get_feature_names()
advcoeffs = advancedmodel.coef_.tolist()[0]
advcoeffdf = pd.DataFrame({'Words' : advwords, 
                        'Coefficient' : advcoeffs})
advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[0, 1])
advcoeffdf.head(10)

Unnamed: 0,Coefficient,Words
104439,0.464142,for hk
271650,0.36446,this is
306931,0.349255,you for
273634,0.34309,tips for
268254,0.332592,the great
170812,0.331978,may be
271808,0.314365,this year
68014,0.304869,could be
99712,0.296328,find out
56838,0.290905,china japan


In [0]:
advcoeffdf.tail(10)

Unnamed: 0,Coefficient,Words
293288,-0.292435,warns against
21519,-0.292937,are more
58092,-0.294024,china warns
1616,-0.298262,19 jan
193376,-0.298262,on 19
181430,-0.298662,national education
177501,-0.333654,more about
156988,-0.335926,last year
147325,-0.407522,jailed for
253308,-0.449226,stanley ho


# CCL

In [0]:
ccl = pd.read_csv("CCL.csv", sep="\t")
ccl.head()

Unnamed: 0,Date,CCL
0,2017/12/25-2017/12/31,165.02
1,2017/12/18-2017/12/24,165.62
2,2017/12/11-2017/12/17,165.3
3,2017/12/04-2017/12/10,164.46
4,2017/11/27-2017/12/03,163.29


In [0]:
def sunday(s):
    return s.split("-")[1]

ccl["Date"] = ccl["Date"].apply(sunday)

In [0]:
ccl["Date"] = pd.to_datetime(ccl.Date, format="%Y/%m/%d", errors='ignore')

In [0]:
ccl.set_index("Date", inplace=True)

In [0]:
ccl["rise_in_next_week"] = (ccl["CCL"].shift(-1)/ccl["CCL"] >= 1).astype("int")

In [0]:
train = data.groupby(pd.Grouper(freq='W'))['text'].apply(lambda x: "%s" % ' '.join(x))
train.tail()

created_at
2018-04-01    China wants to build the brains behind 30 mill...
2018-04-08    Sony's new CEO faces a tough task: rekindling ...
2018-04-15    Chinese investors in Malaysia will face more s...
2018-04-22    - China and Japan join forces on North Korea\r...
2018-04-29    U.S. is hinting at a China truce as the world ...
Freq: W-SUN, Name: text, dtype: object

In [0]:
df = pd.merge(train.to_frame(), ccl["rise_in_next_week"].to_frame(), left_index=True, right_index=True)

In [0]:
df.head()

Unnamed: 0,text,rise_in_next_week
2009-03-15,@Tortue it seems @scmp was already taken ... a...,0
2009-03-22,"Sifu Ip Chun, son and successor of Ip Man, kee...",1
2009-03-29,Paging all twitter folk for questions for Oliv...,0
2009-04-05,Were you caught on camera at the Sevens this w...,1
2009-04-12,US Senator John McCain prods China on N Korea...,0


In [0]:
df.to_csv("ccl_clean.csv")

In [0]:
train = df[df.index < '2014-01-01']
test = df[df.index > '2013-12-31']

In [0]:
trainheadlines = train["text"].values

basicvectorizer = CountVectorizer()
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)

(251, 18016)


In [0]:
basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["rise_in_next_week"])

In [0]:
testheadlines = test["text"].values

basictest = basicvectorizer.transform(testheadlines)
predictions = basicmodel.predict(basictest)

In [0]:
pd.crosstab(test["rise_in_next_week"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,42,81
1,27,59


In [0]:
acc=accuracy_score(test['rise_in_next_week'], predictions)
acc

0.48325358851674644

In [0]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(10)

Unnamed: 0,Coefficient,Word
7020,0.3706,great
16406,0.359206,treasures
5619,0.34803,escape
13452,0.339272,retracing
6406,0.278116,for
10816,0.275534,news
16036,0.260446,their
4327,0.228463,death
1374,0.227397,art
1842,0.227083,be


In [0]:
coeffdf.tail(10)

Unnamed: 0,Coefficient,Word
1911,-0.197422,beijing
14003,-0.199593,schools
3920,-0.207629,could
14016,-0.212326,scmp
17254,-0.236449,want
1472,-0.23749,at
14143,-0.244615,see
16179,-0.248903,tips
11282,-0.258076,our
16892,-0.260164,us


In [0]:
trainheadlines = train["text"].values

advancedvectorizer = CountVectorizer(ngram_range=(2,2))
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)

In [0]:
print(advancedtrain.shape)

(251, 136005)


In [0]:
advancedmodel = LogisticRegression()
advancedmodel = advancedmodel.fit(advancedtrain, train["rise_in_next_week"])

In [0]:
testheadlines = test["text"]

advancedtest = advancedvectorizer.transform(testheadlines)
advpredictions = advancedmodel.predict(advancedtest)

In [0]:
pd.crosstab(test["rise_in_next_week"], advpredictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,76,47
1,34,52


In [0]:
acc=accuracy_score(test['rise_in_next_week'], advpredictions)
acc

0.6124401913875598

In [0]:
advwords = advancedvectorizer.get_feature_names()
advcoeffs = advancedmodel.coef_.tolist()[0]
advcoeffdf = pd.DataFrame({'Words' : advwords, 
                        'Coefficient' : advcoeffs})
advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[0, 1])
advcoeffdf.head(10)

Unnamed: 0,Coefficient,Words
118055,0.299876,the great
9197,0.294691,art treasures
38954,0.294691,escape of
50741,0.294691,great escape
100414,0.294691,retracing the
23568,0.293752,china art
82555,0.238028,of china
85541,0.20452,on mainland
2449,0.1979,abonel dserjeant
32522,0.1979,deeseebeaucoup geoffrey_wu


In [0]:
advcoeffdf.tail(10)

Unnamed: 0,Coefficient,Words
132590,-0.09719,with the
45134,-0.098562,for hk
135040,-0.098693,you re
80933,-0.098867,nifty futures
52217,-0.105802,happy valley
118290,-0.108932,the mainland
98836,-0.118984,register at
87469,-0.121895,out the
59380,-0.124855,in hk
60089,-0.153772,in the


# HSI

In [0]:
hsi = pd.read_csv("HSI.csv")
hsi.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,"May 03, 2018",30163.0,30265.0,30300.5,30078.5,-,-1.83%
1,"May 02, 2018",30723.88,30782.21,30824.77,30570.81,1.69B,-0.27%
2,"Apr 30, 2018",30808.45,30530.4,30853.13,30483.91,1.98B,1.74%
3,"Apr 27, 2018",30280.67,30300.83,30336.0,30019.12,1.66B,0.91%
4,"Apr 26, 2018",30007.68,30378.15,30463.42,29871.2,1.82B,-1.06%


In [0]:
def rmprecent(s):
    return s.split("%")[0]
hsi["rise_nextday"] = (hsi["Change %"].apply(rmprecent).astype("float") >= 1).astype("int")

In [0]:
hsi["Date"] = pd.to_datetime(hsi.Date, format="%b %d, %Y", errors='ignore')
hsi = hsi.set_index("Date")

In [0]:
df = pd.merge(train.to_frame(), hsi["rise_nextday"].to_frame(), left_index=True, right_index=True)

In [0]:
df.head()

Unnamed: 0,text,rise_nextday
2009-03-13,@Tortue it seems @scmp was already taken ... a...,1
2009-03-16,"Sifu Ip Chun, son and successor of Ip Man, kee...",1
2009-03-17,"@jeromyu well, the multimedia team here at the...",0
2009-03-18,US Newspapers may be on their deathbed: but n...,1
2009-03-19,Interview today with Josie Ho Chiu-yi (¦ó¶W»ö)...,0


In [0]:
train = df[df.index < '2017-01-01']
test = df[df.index > '2016-12-31']

In [0]:
trainheadlines = train["text"].values

basicvectorizer = CountVectorizer()
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)

(1934, 29472)


In [0]:
basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["rise_nextday"])

In [0]:
testheadlines = test["text"].values

basictest = basicvectorizer.transform(testheadlines)
predictions = basicmodel.predict(basictest)

In [0]:
pd.crosstab(test["rise_nextday"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,268,14
1,42,1


In [0]:
acc=accuracy_score(test['rise_nextday'], predictions)
acc

0.8276923076923077

In [0]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(10)

Unnamed: 0,Coefficient,Word
5540,0.678828,cities
7253,0.648568,death
10968,0.645852,gain
28932,0.60611,wu
19514,0.589999,photos
25339,0.557993,success
12852,0.540911,housing
22133,0.529049,rice
9366,0.520561,estate
17663,0.502445,national


In [0]:
coeffdf.tail(10)

Unnamed: 0,Coefficient,Word
26878,-0.474506,treat
27118,-0.47764,turns
3476,-0.495035,bid
5323,-0.503683,chief
6963,-0.509164,cuts
20815,-0.540981,quake
26295,-0.554197,they
23507,-0.594936,she
8555,-0.637324,drop
9850,-0.792279,fall


In [0]:
trainheadlines = train["text"].values

advancedvectorizer = CountVectorizer(ngram_range=(2,2))
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)

In [0]:
advancedmodel = LogisticRegression()
advancedmodel = advancedmodel.fit(advancedtrain, train["rise_nextday"])

In [0]:
testheadlines = test["text"]

advancedtest = advancedvectorizer.transform(testheadlines)
advpredictions = advancedmodel.predict(advancedtest)

In [0]:
pd.crosstab(test["rise_nextday"], advpredictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0
Actual,Unnamed: 1_level_1
0,282
1,43


In [0]:
acc=accuracy_score(test['rise_nextday'], advpredictions)
acc

0.8676923076923077

In [0]:
advwords = advancedvectorizer.get_feature_names()
advcoeffs = advancedmodel.coef_.tolist()[0]
advcoeffdf = pd.DataFrame({'Words' : advwords, 
                        'Coefficient' : advcoeffs})
advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[0, 1])
advcoeffdf.head(10)

Unnamed: 0,Coefficient,Words
273605,0.420108,to be
87026,0.349972,education subject
257396,0.34339,subject to
110773,0.337257,futures up
32189,0.335682,be delayed
297533,0.309945,why how
232746,0.30709,sacred kingdom
291286,0.305944,vote to
181091,0.304426,national education
94061,0.301357,extended to


In [0]:
advcoeffdf.tail(10)

Unnamed: 0,Coefficient,Words
135115,-0.294563,in india
18365,-0.298689,and the
7280,-0.303027,accused of
110730,-0.303227,futures down
11724,-0.313705,ai weiwei
36550,-0.32045,bid to
306321,-0.331299,you can
254983,-0.350723,stocks fall
136353,-0.388615,in this
144749,-0.426521,is the
