###  Level of Sentiment

In [3]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them at most once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Tokenise ``text`` into lower-cased alphabetic words with stop words removed.

    The text is split on whitespace only. A token is kept only if it consists
    entirely of letters, so tokens carrying punctuation or digits (for example
    ``"growth,"`` or ``"2005"``) are discarded rather than stripped.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : container of str, optional
        Lower-cased words to drop. Any object supporting ``in`` works. When
        omitted, NLTK's English stop-word list is used; it is loaded once and
        reused across calls instead of being re-read for every document.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    # The alphabetic check runs on the raw token, the stop-word check on the lowered one.
    lowered = (token.lower() for token in text.split() if token.isalpha())
    return [token for token in lowered if token not in stop_words]

In [6]:
# Positive and negative word lists from NLTK's opinion_lexicon corpus.
# NOTE: `postive_lexicon` is misspelled, but the positive CountVectorizer is
# built from that exact name further down, so the name is kept as-is.
postive_lexicon = opinion_lexicon.positive()
negative_lexicon = opinion_lexicon.negative()

In [8]:
# Names of the MD&A filing files. os.listdir returns entries in arbitrary
# order, so sort them to keep the row order of the downstream frames
# reproducible. Hidden entries (e.g. .DS_Store) and sub-directories are skipped
# because they do not follow the underscore-delimited naming the parser expects.
files = sorted(
    name
    for name in os.listdir('mda/')
    if not name.startswith('.') and os.path.isfile(os.path.join('mda/', name))
)

In [9]:
# Read every MD&A file into a dict keyed by "<cik>_<filing date>".
# File names are underscore-delimited: field 0 is the CIK, field 2 is the
# filing date followed by the file extension.
# NOTE(review): two files sharing the same CIK and filing date map to the same
# key, so the later one overwrites the earlier one — confirm this cannot occur.
files_text = {}
for file in files:
    name_fields = file.split('_')
    cik = name_fields[0]
    filing_date = name_fields[2].partition('.')[0]  # keep only the part before the first dot
    cik_filing_date = '_'.join((cik, filing_date))
    with open('mda/' + file) as f:
        text = f.read()
    files_text[cik_filing_date] = text

In [12]:
# Store the filings in a DataFrame with one row per "<cik>_<filing date>" key.
# The key is split back into its CIK and filing-date parts, which are then
# converted to a numeric column and a timestamp column respectively.
#
# The frame is built directly in its final column order. This avoids the
# wide-then-transpose construction and in-place mutation, and it avoids ending
# on a column-subset selection, which can make later `df_text[...] = ...`
# assignments emit SettingWithCopyWarning on older pandas versions.
# It also yields an empty, correctly-labelled frame when `files_text` is empty.
df_text = pd.DataFrame(
    [(*key.split('_'), text_body) for key, text_body in files_text.items()],
    columns=['cik', 'filing_date', 'files_text'],
).assign(
    cik=lambda frame: pd.to_numeric(frame['cik']),
    filing_date=lambda frame: pd.to_datetime(frame['filing_date'], format='%Y-%m-%d'),
)

In [13]:
# Preview the first rows to check the parsed CIK, filing date and raw text.
df_text.head()

Unnamed: 0,cik,filing_date,files_text
0,1001082,2005-03-16,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...
1,1001082,2006-03-15,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...
2,1001082,2007-03-01,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...
3,1001082,2008-02-26,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...
4,1001082,2009-03-02,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...


In [14]:
# Tokenise each filing into a list of lower-cased, stop-word-free words.
df_text['text_clean'] = df_text['files_text'].map(clean_text)

In [15]:
# Preview the frame with the new `text_clean` token lists.
df_text.head()

Unnamed: 0,cik,filing_date,files_text,text_clean
0,1001082,2005-03-16,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili..."
1,1001082,2006-03-15,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili..."
2,1001082,2007-03-01,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili..."
3,1001082,2008-02-26,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...,"[company, dish, network, corp, filing, item, m..."
4,1001082,2009-03-02,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...,"[company, dish, network, corp, filing, item, m..."


In [26]:
# Count only the terms of the positive lexicon: passing `vocabulary` fixes the
# feature set, so each output column corresponds to one lexicon word.
vectorizer = CountVectorizer(vocabulary=postive_lexicon)

In [27]:
# CountVectorizer expects strings, so re-join each token list with spaces
# before building the document-term matrix of positive-word counts.
df_text['text_clean_str'] = df_text['text_clean'].apply(" ".join)
dtm_positive_words = vectorizer.fit_transform(df_text['text_clean_str'])

In [28]:
# Materialise the sparse document-term matrix as a dense frame
# (one row per filing, one integer-labelled column per lexicon term).
df_dtm_positive_words = pd.DataFrame(dtm_positive_words.toarray())

In [29]:
# Preview the dense positive-word count matrix (columns are still positional).
df_dtm_positive_words.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Label each column with its lexicon term. `vocabulary_` maps term -> column
# index, so order the terms by that index to line them up with the matrix.
df_dtm_positive_words.columns = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

In [31]:
# Total positive-lexicon hits per filing (row-wise sum over all term columns).
df_text['Positive_count'] = df_dtm_positive_words.sum(axis='columns')

In [32]:
# Preview the frame with the new `Positive_count` column.
df_text.head()

Unnamed: 0,cik,filing_date,files_text,text_clean,text_clean_str,Positive_count
0,1001082,2005-03-16,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili...",company echostar communications corp filing it...,36
1,1001082,2006-03-15,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili...",company echostar communications corp filing it...,58
2,1001082,2007-03-01,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili...",company echostar communications corp filing it...,11
3,1001082,2008-02-26,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...,"[company, dish, network, corp, filing, item, m...",company dish network corp filing item manageme...,45
4,1001082,2009-03-02,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...,"[company, dish, network, corp, filing, item, m...",company dish network corp filing item manageme...,24


In [33]:
# Same pipeline as for the positive lexicon, now restricted to negative terms.
# Note that `vectorizer` is rebound here and no longer refers to the positive one.
vectorizer = CountVectorizer(vocabulary=negative_lexicon)
df_text['text_clean_str'] = df_text['text_clean'].apply(" ".join)
dtm_negative_words = vectorizer.fit_transform(df_text['text_clean_str'])
# `vocabulary_` maps term -> column index; order terms by index to label the columns.
df_dtm_negative_words = pd.DataFrame(
    dtm_negative_words.toarray(),
    columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get),
)
# Total negative-lexicon hits per filing.
df_text['Negative_count'] = df_dtm_negative_words.sum(axis='columns')

In [34]:
# Number of tokens that survived cleaning; used as the denominator of the sentiment ratios.
df_text['num_cleaned_words'] = df_text['text_clean'].apply(len)

In [35]:
# Sentiment levels: share of cleaned tokens that are positive / negative lexicon hits.
df_text['phi_pos'] = df_text['Positive_count'].div(df_text['num_cleaned_words'])
df_text['phi_neg'] = df_text['Negative_count'].div(df_text['num_cleaned_words'])

In [36]:
# Preview the final frame with counts, token totals and the phi_pos / phi_neg ratios.
df_text.head()

Unnamed: 0,cik,filing_date,files_text,text_clean,text_clean_str,Positive_count,Negative_count,num_cleaned_words,phi_pos,phi_neg
0,1001082,2005-03-16,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili...",company echostar communications corp filing it...,36,33,798,0.045113,0.041353
1,1001082,2006-03-15,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili...",company echostar communications corp filing it...,58,47,1037,0.055931,0.045323
2,1001082,2007-03-01,CIK: 1001082\nCompany Name: ECHOSTAR COMMUNICA...,"[company, echostar, communications, corp, fili...",company echostar communications corp filing it...,11,3,314,0.035032,0.009554
3,1001082,2008-02-26,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...,"[company, dish, network, corp, filing, item, m...",company dish network corp filing item manageme...,45,42,766,0.058747,0.05483
4,1001082,2009-03-02,CIK: 1001082\nCompany Name: DISH NETWORK CORP\...,"[company, dish, network, corp, filing, item, m...",company dish network corp filing item manageme...,24,19,437,0.05492,0.043478
