In [1]:
#! Python3
# by Jacob Kovach
# Confidential and Proprietary

import numpy as np, pandas as pd, urllib.request, pickle, spacy, re, os
from nltk import tokenize
from collections import Counter

with open('/Users/jkovach/Downloads/earnings-call-transcripts/_call_df_500', 'rb') as file:
    earnings_df = pickle.load(file)
earnings_df = earnings_df.drop(['datetime', 'filename', 'ticker', 'raw', 'header_check', 
                                'footer_check', 'price_delta', 'duration'], 1)
print(earnings_df.shape)
earnings_df.head()

(425, 2)


Unnamed: 0,content,Movement
0,ladies gentleman stand begin good welcome delt...,1
1,stand begin good lady gentleman welcome jp mor...,1
2,hello welcome earning review chief executive o...,1
3,welcome bank america earnings announcement tim...,2
4,good conference operator time like welcome wel...,1


In [92]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

sample_split=0.3
X_train, X_test = train_test_split(earnings_df['content'], test_size=sample_split, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.4, 
                             min_df=0.1, 
                             stop_words='english', 
                             lowercase=True, 
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True,
                             ngram_range=(1,3),
                            )

#Applying the vectorizer
earnings_tfidf = vectorizer.fit_transform(earnings_df['content']).toarray()
labels = earnings_df['Movement']
terms = vectorizer.get_feature_names()
print("Number of features: %d" % len(earnings_tfidf))

"""#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(earnings_tfidf, test_size=sample_split, random_state=0)

#Reduce the feature space from 1379 to 225.
svd = TruncatedSVD(250)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:", total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
paras_by_component=paras_by_component.merge(earnings_df, on='content')
paras_by_component.head()"""

Number of features: 425


'#splitting into training and test sets\nX_train_tfidf, X_test_tfidf= train_test_split(earnings_tfidf, test_size=sample_split, random_state=0)\n\n#Reduce the feature space from 1379 to 225.\nsvd = TruncatedSVD(250)\nlsa = make_pipeline(svd, Normalizer(copy=False))\n# Run SVD on the training data, then project the training data.\nX_train_lsa = lsa.fit_transform(X_train_tfidf)\n\nvariance_explained=svd.explained_variance_ratio_\ntotal_variance = variance_explained.sum()\nprint("Percent variance captured by all components:", total_variance*100)\n\n#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics\nparas_by_component=pd.DataFrame(X_train_lsa,index=X_train)\nparas_by_component=paras_by_component.merge(earnings_df, on=\'content\')\nparas_by_component.head()'

In [93]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

sampler_state = 42
ros = RandomOverSampler(random_state=sampler_state)
rus = RandomUnderSampler(random_state=sampler_state)
sm = SMOTE(random_state=sampler_state)

"""y = paras_by_component['Movement']
X = paras_by_component.drop(['content', 'Movement'],1)"""
print("Class Balance: {}".format(sorted(Counter(labels).items())))

X_train, X_test, y_train, y_test = train_test_split(earnings_tfidf, labels,
                                                    test_size=sample_split,
                                                    random_state=0)


X_train_res, y_train_res = ros.fit_sample(X_train, y_train)

print("Class Balance: {}".format(sorted(Counter(y_train_res).items())))
print("Class Balance: {}".format(sorted(Counter(y_test).items())))

Class Balance: [(0, 106), (1, 227), (2, 92)]
Class Balance: [(0, 158), (1, 158), (2, 158)]
Class Balance: [(0, 31), (1, 69), (2, 28)]


In [94]:
from sklearn import ensemble
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics 

rfc = ensemble.RandomForestClassifier(n_estimators=200)
rfc.fit(X_train_res, y_train_res)

y_pred = rfc.predict(X_test)

print('Training set score:', rfc.score(X_train_res, y_train_res))
print('\nTest set score:', rfc.score(X_test, y_test))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

Training set score: 1.0

Test set score: 0.53125
              precision    recall  f1-score   support

           0       0.33      0.10      0.15        31
           1       0.55      0.90      0.69        69
           2       0.43      0.11      0.17        28

    accuracy                           0.53       128
   macro avg       0.44      0.37      0.34       128
weighted avg       0.47      0.53      0.44       128

[[ 3 26  2]
 [ 5 62  2]
 [ 1 24  3]]


In [95]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.8)
train = lr.fit(X_train_res, y_train_res)
y_pred = lr.predict(X_test)

print('Training set score:', lr.score(X_train_res, y_train_res))
print('\nTest set score:', lr.score(X_test, y_test))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))



Training set score: 0.6286919831223629

Test set score: 0.46875
              precision    recall  f1-score   support

           0       0.31      0.35      0.33        31
           1       0.65      0.54      0.59        69
           2       0.34      0.43      0.38        28

    accuracy                           0.47       128
   macro avg       0.43      0.44      0.43       128
weighted avg       0.50      0.47      0.48       128

[[11 11  9]
 [18 37 14]
 [ 7  9 12]]


In [96]:
params = {'n_estimators': 1000,
          'max_depth': 5,
          'loss': 'deviance'}

clf = ensemble.GradientBoostingClassifier(**params)
train = clf.fit(X_train_res, y_train_res)
y_pred = clf.predict(X_test)

print('Training set score:', clf.score(X_train_res, y_train_res))
print('\nTest set score:', clf.score(X_test, y_test))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

Training set score: 1.0

Test set score: 0.5390625
              precision    recall  f1-score   support

           0       0.23      0.10      0.14        31
           1       0.58      0.83      0.68        69
           2       0.53      0.32      0.40        28

    accuracy                           0.54       128
   macro avg       0.45      0.41      0.41       128
weighted avg       0.49      0.54      0.49       128

[[ 3 24  4]
 [ 8 57  4]
 [ 2 17  9]]


In [97]:
from sklearn.svm import SVC

# Instantiate SVM object
svm = SVC(kernel = 'linear')
train = svm.fit(X_train_res, y_train_res)
y_pred = svm.predict(X_test)

print('Training set score:', svm.score(X_train_res, y_train_res))
print('\nTest set score:', svm.score(X_test, y_test))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

Training set score: 0.9430379746835443

Test set score: 0.4765625
              precision    recall  f1-score   support

           0       0.36      0.32      0.34        31
           1       0.58      0.61      0.60        69
           2       0.32      0.32      0.32        28

    accuracy                           0.48       128
   macro avg       0.42      0.42      0.42       128
weighted avg       0.47      0.48      0.47       128

[[10 15  6]
 [14 42 13]
 [ 4 15  9]]


In [None]:
###
"""nlp = spacy.load('en')

corpus = '\n'.join(list(earnings_df['content']))

def word_frequencies(text):
    return Counter(text.split())
    
# The most frequent words:
freq = word_frequencies(corpus).most_common(100)
common_words = [word[0] for word in freq]

def remove_common(text):
    text = ' '.join([word for word in text.split() if word not in common_words])
    return text

earnings_df['redux'] = earnings_df['content'].apply(remove_common)
earnings_df.head()"""