In [None]:
# Load the pickled document-term matrix and flip it so that each column is one company
import pandas as pd

data = pd.read_pickle('dtm.pkl').transpose()
data.head()

In [None]:
# Build {company: [(word, count), ...]} holding the 30 most frequent words of each company
top_dict = {}

for company in data.columns:
    ranked = data[company].sort_values(ascending=False)
    top_dict[company] = list(zip(ranked.index[:30], ranked.values[:30]))

top_dict

In [None]:
# Print the top 15 words said by each company
for company, top_words in top_dict.items():
    print(company)
    # The slice end is exclusive, so fifteen entries need [:15] ([0:14] yields only 14)
    print(','.join(word for word, _ in top_words[:15]))
    print('---')

In [None]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Flatten every company's top-30 list into a single list of words (the counts are dropped)
words = [word for company in data.columns for word, _ in top_dict[company]]

words



In [None]:
# Tally how often each word occurs in `words` and list the tallies, most frequent first
Counter(words).most_common()

In [None]:
# If more than half of the companies have it as a top word, exclude it from the list.
# The cut-off is derived from the number of company columns rather than a hard-coded
# count, so it keeps meaning "more than half" whatever the size of the matrix is.
half_of_companies = len(data.columns) / 2
add_stop_words = [word for word, count in Counter(words).most_common() if count > half_of_companies]
add_stop_words

In [None]:
# Hand-written list of common Persian function words to drop from the vocabulary.
# 'آن' and 'ای' each appear twice; the repeats are kept exactly as they were.
stop_words = [
    'را', 'با', 'است', 'می', 'که', 'این', 'از', 'به', 'در', 'و',
    'های', 'برای', 'آن', 'یک', 'ها', 'شود', 'شده', 'خود', 'کرد', 'ای',
    'کرده', 'داشته', 'بوده', 'بود', 'زده', 'تا', 'هر', 'هم', 'نیز', 'خواهد',
    'شد', 'بر', 'دارد', 'زد', 'آن', 'یا', 'باشد', 'میگردد', 'ای',
]

In [None]:
# Let's update our document-term matrix with the new list of stop words

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Add new stop words (the frequent top words found above come first)
stop_words = add_stop_words + stop_words

# Recreate document-term matrix: one row per post, one column per remaining word
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.Post)
# get_feature_names() no longer exists in scikit-learn >= 1.2; use its replacement when
# it is available and fall back to the old name on releases that predate it.
feature_names = cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = data_clean.hashtags

# Pickle it for later use; the context manager closes the file even if dump() fails
import pickle
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtm_stop.pkl")

In [None]:
# Find the number of unique words that each company uses

# Series.nonzero() was removed in pandas 1.0. Counting the non-zero cells of every
# column gives the same numbers, in column order, on any pandas version.
unique_list = (data != 0).sum(axis=0).tolist()
data_words = pd.DataFrame(list(zip(data_clean.hashtags, unique_list)), columns=['companies', "unique_words"])
data_unique_sort = data_words.sort_values(by="unique_words")
data_unique_sort

In [None]:
# Find the total number of words which each company uses

# A single column-wise DataFrame.sum replaces calling Python's builtin sum() on every
# column, which iterates element by element at interpreter speed.
total_list = data.sum(axis=0).tolist()

data_words['total_words'] = total_list

data_wpm_sort = data_words.sort_values(by='total_words')
data_wpm_sort

In [None]:
# Keep only the companies whose unique-word count differs from their total word count
counts_differ = data_wpm_sort['unique_words'].ne(data_wpm_sort['total_words'])
data_wpm_sort = data_wpm_sort[counts_differ]
data_wpm_sort

In [None]:
# Drop the company label so that only the numeric count columns remain
corrmat = data_wpm_sort.drop(columns=['companies'])
corrmat

In [None]:
# Pairwise correlation between the columns of corrmat
correlated_data = corrmat.corr()
correlated_data 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Scatter-plot matrix of the columns of corrmat
sns.pairplot(corrmat)
plt.tight_layout()

In [None]:
# Show the word tallies again, most frequent first
Counter(words).most_common()

In [None]:
# Number of negative words for every company
negative_terms = ['زیان', 'افت', 'کاهش', 'تعلیق', 'منفی', 'بدهی']

# reindex() instead of plain column selection: a term that never occurs in the corpus is
# absent from the vocabulary and would raise KeyError; it is counted as 0 instead.
data_bad_words = data.transpose().reindex(columns=negative_terms, fill_value=0)

# Row-wise total of the six term counts, as a one-column frame. This replaces a
# pd.concat() of a single Series followed by a DataFrame(...) re-wrap that only
# produced values because the summed Series happened to be unnamed.
data_bw = data_bad_words.sum(axis=1).to_frame('bad_words')
data_bw

In [None]:
# Number of positive words for every company
positive_terms = ['سود', 'افزایش', 'رشد', 'مثبت', 'پیشرفت', 'بهبود']

# reindex() instead of plain column selection: a term that never occurs in the corpus is
# absent from the vocabulary and would raise KeyError; it is counted as 0 instead.
data_good_words = data.transpose().reindex(columns=positive_terms, fill_value=0)

# Row-wise total of the six term counts, as a one-column frame. This replaces a
# pd.concat() of a single Series followed by a DataFrame(...) re-wrap that only
# produced values because the summed Series happened to be unnamed.
data_gw = data_good_words.sum(axis=1).to_frame('good_words')
data_gw

In [None]:
# Put the negative-word counts next to the positive-word counts (matched on the index)
data_gw['bad_words'] = data_bw.bad_words

In [None]:
# Rank the companies from most to fewest positive words
data_gbw = data_gw.sort_values('good_words', ascending=False)
data_gbw

In [None]:
# Remove rows with zero good and bad words.
# A row is dropped only when BOTH counts are 0. Comparing the two columns for
# inequality would also discard companies whose counts are equal but non-zero.
has_sentiment_words = (data_gbw['good_words'] != 0) | (data_gbw['bad_words'] != 0)
# .copy() makes the result an independent frame, so columns can be added to it later
data_gbw = data_gbw[has_sentiment_words].copy()
data_gbw

In [None]:
# Set the index name to an empty string
data_gbw.index.name = ''
data_gbw

In [None]:
# Summary statistics for the columns of data_gbw
data_gbw.describe()

In [None]:
# Pairwise correlation between the columns of data_gbw (rebinds correlated_data)
correlated_data = data_gbw.corr()
correlated_data 

In [None]:
# Scatter-plot matrix of the columns of data_gbw
sns.pairplot(data_gbw)
plt.tight_layout()

In [None]:
plt.rcParams['figure.figsize'] = [15, 10]

# Good-word count (x) against bad-word count (y), one blue dot per company.
# A single vectorised call replaces a per-row loop of .loc look-ups and scatter calls.
plt.scatter(data_gbw.good_words, data_gbw.bad_words, color="blue")

plt.title('Number of Bad and Good Words')
plt.xlabel('Number of Good Words', fontsize=15)
plt.ylabel('Number of Bad Words', fontsize=15)
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = [15, 10]

# Good-word count (x) against bad-word count (y), one blue dot per company
plt.scatter(data_gbw.good_words, data_gbw.bad_words, color="blue")

# Label each dot with its company, slightly offset so the text does not cover the marker.
# Iterating over the values (instead of .loc look-ups by label) also keeps x and y
# scalar when an index label occurs more than once.
for company, x, y in zip(data_gbw.index, data_gbw.good_words, data_gbw.bad_words):
    plt.text(x + 1.5, y + 0.5, company, fontsize=10)

# The axis limits do not depend on the row, so they are set once
plt.xlim(-5, 155)

plt.title('Number of Bad and Good Words')
plt.xlabel('Number of Good Words', fontsize=15)
plt.ylabel('Number of Bad Words', fontsize=15)
plt.show()

In [None]:
# Per-company average of the good- and bad-word counts. The two columns are named
# explicitly so that re-running this cell once data_gbw has gained a 'mean' column
# does not average that column in as well.
mean_gbw = data_gbw[['good_words', 'bad_words']].mean(axis=1)

In [None]:
# Attach the per-company average as a 'mean' column. assign() returns a new frame, which
# avoids pandas' SettingWithCopyWarning when data_gbw is the result of an earlier filter.
data_gbw = data_gbw.assign(mean=mean_gbw)

In [None]:
# Status of every company based on positive and negative words:
# sort by the 'mean' column, highest first, and show the first 15 rows
data_gbw = data_gbw.sort_values(by ='mean', ascending = False)
data_gbw.head(15)

In [None]:
# Re-read the cleaned data from its pickle file and display it
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

In [None]:
# Hashtags of the 15 companies to analyse in more detail
selected_companies = [
    'وبملت', 'خودرو', 'وتجارت', 'فولاد', 'وبصادر',
    'ونوین', 'فملی', 'کچاد', 'وپارس', 'کگل',
    'خساپا', 'شیران', 'چکارن', 'فارس', 'وپاسار',
]

In [None]:
# Keep only the rows of data_clean whose hashtag is one of the selected companies.
# Note: the name is rebound here from the list of hashtags to the filtered DataFrame.
is_selected = data_clean['hashtags'].isin(selected_companies)
selected_companies = data_clean.loc[is_selected]

In [None]:
# Show the first 15 rows of the selected companies' data
selected_companies.head(15)

In [None]:
# Reset the stop-word list to the hand-written Persian function words only.
# 'آن' and 'ای' each appear twice; the repeats are kept exactly as they were.
stop_words = [
    'را', 'با', 'است', 'می', 'که', 'این', 'از', 'به', 'در', 'و',
    'های', 'برای', 'آن', 'یک', 'ها', 'شود', 'شده', 'خود', 'کرد', 'ای',
    'کرده', 'داشته', 'بوده', 'بود', 'زده', 'تا', 'هر', 'هم', 'نیز', 'خواهد',
    'شد', 'بر', 'دارد', 'زد', 'آن', 'یا', 'باشد', 'میگردد', 'ای',
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Document-term matrix for the selected companies only: one row per post, one column per word
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(selected_companies.Post)
# get_feature_names() no longer exists in scikit-learn >= 1.2; use its replacement when
# it is available and fall back to the old name on releases that predate it.
vocabulary = cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()
data_dtm = pd.DataFrame(data_cv.toarray(), columns=vocabulary)
data_dtm.index = selected_companies.hashtags

In [None]:
# Flip the matrix so that each column is one selected company and each row is a word
data = data_dtm.T
data.head()

In [None]:
# Rebuild {company: [(word, count), ...]} with the 30 most frequent words per selected company
top_dict = {}

for company in data.columns:
    ranked = data[company].sort_values(ascending=False)
    top_dict[company] = list(zip(ranked.index[:30], ranked.values[:30]))

top_dict

In [None]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Flatten every company's top-30 list into a single list of words (the counts are dropped)
words = [word for company in data.columns for word, _ in top_dict[company]]

words

In [None]:
# Tally how often each word occurs in `words` and list the tallies, most frequent first
Counter(words).most_common()

In [None]:
# If more than half of the companies have it as a top word, exclude it from the list.
# The cut-off is derived from the number of company columns rather than a hard-coded
# count, so it keeps meaning "more than half" however many companies are selected.
half_of_companies = len(data.columns) / 2
add_stop_words = [word for word, count in Counter(words).most_common() if count > half_of_companies]
add_stop_words

In [None]:
# Reset the stop-word list to the hand-written Persian function words only.
# 'آن' and 'ای' each appear twice; the repeats are kept exactly as they were.
stop_words = [
    'را', 'با', 'است', 'می', 'که', 'این', 'از', 'به', 'در', 'و',
    'های', 'برای', 'آن', 'یک', 'ها', 'شود', 'شده', 'خود', 'کرد', 'ای',
    'کرده', 'داشته', 'بوده', 'بود', 'زده', 'تا', 'هر', 'هم', 'نیز', 'خواهد',
    'شد', 'بر', 'دارد', 'زد', 'آن', 'یا', 'باشد', 'میگردد', 'ای',
]

In [None]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer


# Add new stop words (the frequent top words found above come first)
stop_words = add_stop_words + stop_words

# Recreate document-term matrix for the selected companies
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(selected_companies.Post)
# get_feature_names() no longer exists in scikit-learn >= 1.2; use its replacement when
# it is available and fall back to the old name on releases that predate it.
feature_names = cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = selected_companies.hashtags

# Pickle it for later use; the context manager closes the file even if dump() fails
import pickle
with open("cvs_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtms_stop.pkl")

In [None]:
# Find the number of unique words that each company uses

# Series.nonzero() was removed in pandas 1.0. Counting the non-zero cells of every
# column gives the same numbers, in column order, on any pandas version.
unique_list = (data != 0).sum(axis=0).tolist()
data_words = pd.DataFrame(list(zip(selected_companies.hashtags, unique_list)), columns=['companies', "unique_words"])
data_unique_sort = data_words.sort_values(by="unique_words")
data_unique_sort

In [None]:
# Find the total number of words which each company uses

# A single column-wise DataFrame.sum replaces calling Python's builtin sum() on every
# column, which iterates element by element at interpreter speed.
total_list = data.sum(axis=0).tolist()

data_words['total_words'] = total_list

data_wpm_sort = data_words.sort_values(by='total_words')
data_wpm_sort

In [None]:
# Show the word tallies again, most frequent first
Counter(words).most_common()

In [None]:
# Number of negative words for every company
negative_terms = ['زیان', 'افت', 'کاهش', 'تعلیق', 'منفی', 'بدهی']

# reindex() instead of plain column selection: with only a handful of companies a term
# may be absent from the vocabulary, which would raise KeyError; it is counted as 0 instead.
data_bad_words = data.transpose().reindex(columns=negative_terms, fill_value=0)

# Row-wise total of the six term counts, as a one-column frame. This replaces a
# pd.concat() of a single Series followed by a DataFrame(...) re-wrap that only
# produced values because the summed Series happened to be unnamed.
data_bw = data_bad_words.sum(axis=1).to_frame('bad_words')
data_bw

In [None]:
# Number of positive words for every company
positive_terms = ['سود', 'افزایش', 'رشد', 'مثبت', 'پیشرفت', 'بهبود']

# reindex() instead of plain column selection: with only a handful of companies a term
# may be absent from the vocabulary, which would raise KeyError; it is counted as 0 instead.
data_good_words = data.transpose().reindex(columns=positive_terms, fill_value=0)

# Row-wise total of the six term counts, as a one-column frame. This replaces a
# pd.concat() of a single Series followed by a DataFrame(...) re-wrap that only
# produced values because the summed Series happened to be unnamed.
data_gw = data_good_words.sum(axis=1).to_frame('good_words')
data_gw

In [None]:
# Put the negative-word counts next to the positive-word counts (matched on the index)
data_gw['bad_words'] = data_bw.bad_words

In [None]:
# Rank the selected companies from most to fewest positive words
data_gbw = data_gw.sort_values('good_words', ascending=False)
data_gbw

In [None]:
plt.rcParams['figure.figsize'] = [15, 10]

# Good-word count (x) against bad-word count (y), one blue dot per selected company.
# A single vectorised call replaces a per-row loop of .loc look-ups and scatter calls.
plt.scatter(data_gbw.good_words, data_gbw.bad_words, color="blue")

plt.title('Number of Bad and Good Words')
plt.xlabel('Number of Good Words', fontsize=15)
plt.ylabel('Number of Bad Words', fontsize=15)
plt.show()

In [None]:
# Let's make some word clouds!
# Terminal / Anaconda Prompt: conda install -c conda-forge wordcloud
from wordcloud import WordCloud

# One reusable generator: white background, "Dark2" colour map, font size capped at 150,
# a fixed random_state, and the current stop_words list excluded from the cloud.
# NOTE(review): no font_path is passed; the default font may not render Persian
# script correctly - confirm on the rendered output.
wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

In [None]:
# Reset the output dimensions
from __future__ import unicode_literals
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [16, 6]

# Kept for reference only: panel titles are taken from each row's own hashtag below,
# so they cannot drift out of step with the order of the data.
company_names = ['وبملت', 'خودرو', 'وتجارت', 'فولاد', 'وبصادر', 'ونوین', 'فملی', 'کچاد', 'وپارس', 'کگل', 'خساپا',
                 'شیران', 'چکارن', 'فارس', 'وپاسار']

# Size the grid from the data: a fixed 3 x 4 layout has only 12 slots, and
# plt.subplot raises as soon as a 13th company is plotted.
n_cols = 4
n_rows = (len(selected_companies) + n_cols - 1) // n_cols  # ceiling division

# Create one subplot per company
for index, (companies, post) in enumerate(zip(selected_companies.hashtags, selected_companies.Post)):
    # Build the cloud from this company's own text. str(selected_companies.Post) is the
    # printed form of the whole column, which made every panel show the same cloud.
    wc.generate(str(post))

    plt.subplot(n_rows, n_cols, index + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(companies)

plt.show()