In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Lift the row cap so displayed frames are never truncated vertically.
pd.options.display.max_rows = None

# Load the results table and preview its first rows.
# NOTE(review): absolute, machine-specific path.
df1 = pd.read_csv(filepath_or_buffer="D:/da/result.csv")
df1.head()

In [None]:
# Read the raw posts file. It is read without a header row, so the two
# columns are named explicitly right after loading.
df2 = pd.read_csv(filepath_or_buffer="D:/da/betasahm1.csv", header=None)
df2.columns = ["Date", "Post"]
df2.head(n=10)

In [None]:
# Extract hashtag tokens from every post: a '#' followed by the shortest run
# of characters that ends at whitespace or at the end of the text.
hashtags = df2["Post"].str.findall(r'#.*?(?=\s|$)')

# Keep the per-post list of hashtags next to the post itself.
df2["hashtags"] = hashtags
df2.head()

In [None]:
# Keep only the rows whose hashtag entry is not an empty list, and only the
# 'hashtags' column of those rows.
hashtags_list_df = df2.loc[
    df2["hashtags"].apply(lambda tags: not (tags == [])),
    ["hashtags"],
]

In [None]:
# Discard rows whose hashtag entry is missing (NaN).
hashtags_list_df = hashtags_list_df.dropna(axis=0, how="any")
hashtags_list_df.head()

In [None]:
# Write the filtered hashtag lists back onto df2, replacing the column created earlier.
# NOTE(review): the right-hand side is a one-column DataFrame rather than a Series;
# rows of df2 that are absent from hashtags_list_df presumably become NaN through
# index alignment (the following cell drops NaN rows) — confirm.
df2["hashtags"] = hashtags_list_df
df2.head()

In [None]:
# Build the working frame from df2, leaving out every row that has a NaN in any column.
df = df2.dropna(axis=0, how="any")
df.head()

In [None]:
# Put the columns in reading order: date, hashtags, then the post text.
df = df.loc[:, ["Date", "hashtags", "Post"]]
df.head()

In [None]:
# Clip the displayed width of every cell to 2 characters, then preview ten rows.
pd.set_option("display.max_colwidth", 2)
df.loc[:, ["Date", "hashtags", "Post"]].head(n=10)

In [None]:
def foo(a):
    """Join the strings in ``a`` into one string separated by '/'."""
    return "/".join(a)


# Merge the posts that share the same hashtag list. The grouping key is the
# string form of the list, and it comes back as the 'hashtags' column.
grouped_posts = df.groupby(by=df["hashtags"].astype(str))
df = grouped_posts.agg({"Post": foo}).reset_index()
df.head()

In [None]:
# Split rows that carry more than one hashtag into one row per hashtag token,
# keeping the merged post text attached to every resulting row.
#
# 'hashtags' holds the string form of a list at this point, so splitting on a
# space yields tokens that still carry brackets, quotes and commas; those are
# stripped by the later cleaning rounds.
#
# `n` is passed by keyword: passing it positionally to Series.str.split is
# deprecated in recent pandas and is slated to become keyword-only.
new_df = pd.DataFrame(
    df["hashtags"].str.split(" ", n=15).tolist(),
    index=df["Post"],
).stack()

# After stack() the index is (Post, token position). Both entries of the level
# list below refer to the outer 'Post' level, so only 'Post' is turned into a
# column; the two-name column assignment that follows relies on exactly two
# columns coming back (Post and the token values).
new_df = new_df.reset_index([0, "Post"])

new_df.columns = ["Post", "hashtags"]
new_df = new_df[["hashtags", "Post"]]

new_df.head()

In [None]:
#clean the hashtags column;round1
from parsivar import Normalizer
# Single shared parsivar normalizer reused by every call below.
my_normalizer = Normalizer()


def clean_text_round1(text):
    """Coerce ``text`` to ``str`` and return the result of ``my_normalizer.normalize``."""
    return my_normalizer.normalize(str(text))


def round1(x):
    """Element-wise adapter that forwards ``x`` to ``clean_text_round1``."""
    return clean_text_round1(x)

In [None]:
# Make sure every hashtag value is a string before normalizing it.
new_df["hashtags"] = new_df["hashtags"].astype(str)

# First cleaning round on the hashtags, kept as a one-column frame.
clean_hashtags1 = new_df["hashtags"].apply(round1).to_frame()
clean_hashtags1

In [None]:
#clean the hashtags column;round2
import re
import string

def clean_text_round2(text):
    """Second cleaning round for hashtag tokens.

    The input is coerced to ``str`` first, so non-string values no longer
    fail on ``.lower()``. The text is then lower-cased, and the following are
    removed in order: the Arabic comma ``،``, every character in
    ``string.punctuation``, and every word that contains a digit.

    Parameters
    ----------
    text : object
        Value to clean; converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. Whitespace is left untouched, so a removed word can
        leave consecutive spaces behind.
    """
    text = str(text).lower()
    text = re.sub(r'[،]+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def round2(x):
    """Element-wise adapter that forwards ``x`` to ``clean_text_round2``."""
    return clean_text_round2(x)

In [None]:
# Make sure the normalized hashtags are strings before the second round.
clean_hashtags1["hashtags"] = clean_hashtags1["hashtags"].astype(str)

# Second cleaning round on the hashtags, kept as a one-column frame.
clean_hashtags2 = clean_hashtags1["hashtags"].apply(round2).to_frame()
clean_hashtags2

In [None]:
# data_clean is another name for clean_hashtags2 (same object, no copy);
# the post text from new_df is attached to it as the 'Post' column.
data_clean = clean_hashtags2
data_clean["Post"] = new_df["Post"]
data_clean.head(n=50)

In [None]:
def foo(a):
    """Join the strings in ``a`` into one string separated by '/'."""
    return "/".join(a)


# Merge the posts that share the same cleaned hashtag; the grouping key comes
# back as the 'hashtags' column.
grouped_clean = data_clean.groupby(by=data_clean["hashtags"].astype(str))
df = grouped_clean.agg({"Post": foo}).reset_index()
df.head()

In [None]:
# Remove the rows the author identified as fake hashtags.
# NOTE(review): the index labels are hard-coded, so they are only valid for
# the exact data and upstream cleaning that produced this frame.
df = df.drop(index=[0, 1, 2, 13, 18, 25])
df.head(n=54)

In [None]:
# Make sure every post is a string before normalizing it.
df["Post"] = df["Post"].astype(str)

# First cleaning round on the post text, kept as a one-column frame.
clean_Post1 = df["Post"].apply(round1).to_frame()
clean_Post1.head()

In [None]:
# Define the second round of cleaning (used for the Post column)
import re
import string

def clean_text_round2(text):
    """Second cleaning round for post text.

    NOTE: this re-defines ``clean_text_round2`` (and ``round2``) from an
    earlier cell; from here on the names refer to this version, which also
    strips square-bracketed spans.

    The input is coerced to ``str`` first, so non-string values no longer
    fail on ``.lower()``. The text is then lower-cased, and the following are
    removed in order: the Arabic comma ``،``, any ``[...]`` span (shortest
    match), every character in ``string.punctuation``, and every word that
    contains a digit.

    Parameters
    ----------
    text : object
        Value to clean; converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. Whitespace is left untouched, so removed pieces can
        leave consecutive spaces behind.
    """
    text = str(text).lower()
    text = re.sub(r'[،]+', '', text)
    # Raw strings: '\[', '\w' and '\d' are invalid escapes in a normal literal.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def round2(x):
    """Element-wise adapter that forwards ``x`` to ``clean_text_round2``."""
    return clean_text_round2(x)

In [None]:
# Make sure the normalized posts are strings before the second round.
clean_Post1["Post"] = clean_Post1["Post"].astype(str)

# Second cleaning round on the post text, kept as a one-column frame.
clean_Post2 = clean_Post1["Post"].apply(round2).to_frame()
clean_Post2.head()

In [None]:
# Define the third round of cleaning
def clean_text_round3(text):
    """Strip curly quotes, the ellipsis character and newlines from ``text``."""
    # One pass: deleting these characters together gives the same result as
    # deleting the quotes/ellipsis first and the newlines afterwards.
    return re.sub('[‘’“”…\n]', '', text)


def round3(x):
    """Element-wise adapter that forwards ``x`` to ``clean_text_round3``."""
    return clean_text_round3(x)

In [None]:
# Make sure the posts are strings before the third round.
clean_Post2["Post"] = clean_Post2["Post"].astype(str)

# Third cleaning round on the post text, kept as a one-column frame.
clean_Post3 = clean_Post2["Post"].apply(round3).to_frame()
clean_Post3.head()

In [None]:
# Bring the hashtag for each cleaned post back from df as a 'hashtags' column.
clean_Post3["hashtags"] = df["hashtags"]

In [None]:
# NOTE(review): the notebook does not explain the row numbers below —
# presumably rows flagged for manual inspection; confirm or remove.
#138, 368, 369, 370 (194,195), (224,225), (291,292), (302, 303), (389,390), (445,446), (476,477), 534, (582,583), (638,639), (717,718)

# Final corpus frame: hashtag first, cleaned post text second.
df = clean_Post3.loc[:, ["hashtags", "Post"]]
df.head()

In [None]:
# Save the corpus frame to disk so later work can reload it.
df.to_pickle(path="corpus.pkl")

In [None]:
# We are going to create a document-term matrix using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vectorizer on the cleaned posts and build the
# document-term matrix: one row per entry of df.Post, one column per term.
cv = CountVectorizer()
data_cv = cv.fit_transform(df.Post)

# get_feature_names() no longer exists in current scikit-learn releases;
# use get_feature_names_out() when available and fall back to the old name
# on installations that predate it.
data_dtm = pd.DataFrame(
    data_cv.toarray(),
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    ),
)

# Label each row of the matrix with the hashtag of the corresponding post.
data_dtm.index = df.hashtags
data_dtm.head()

In [None]:
# Save the document-term matrix to disk so later work can reload it.
data_dtm.to_pickle(path="dtm.pkl")

In [None]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object

import pickle
# Save the cleaned frame (before it was turned into a document-term matrix).
df.to_pickle('data_clean.pkl')

# Save the fitted CountVectorizer. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)