### Imports and additional library installs

In [None]:
!pip install nltk
!pip install wordcloud
!pip install boto3

In [None]:
import pyspark
import os

import re
import nltk
nltk.download('stopwords')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from wordcloud import WordCloud

### File Import path

In [None]:
# Build the Spark configuration; the master host name is read from SPARK_CLUSTER.
masterUrl = 'spark://' + os.environ['SPARK_CLUSTER'] + ':7077'
conf = (
    pyspark.SparkConf()
    .setAppName('Text Analysis')
    .setMaster(masterUrl)
)

# Connect to the cluster, reusing an already-running context if there is one.
sc = pyspark.SparkContext.getOrCreate(conf)

# S3 credentials, endpoint and bucket name all come from the environment.
accessKey = os.environ['AWS_ACCESS_KEY_ID']
secretKey = os.environ['AWS_SECRET_ACCESS_KEY']
endpointUrl = os.environ['S3_ENDPOINT_URL']
s3Bucket = os.environ['S3BUCKET']

# Hand the S3 settings to Hadoop's s3a connector so Spark can reach Ceph S3.
hadoopConf = sc._jsc.hadoopConfiguration()
for option, value in (
    ("fs.s3a.access.key", accessKey),
    ("fs.s3a.secret.key", secretKey),
    ("fs.s3a.endpoint", endpointUrl),
):
    hadoopConf.set(option, value)

# SQL entry point used for reading the file and for the query further down.
sqlContext = pyspark.SQLContext(sc)

# Read the tab-separated dataset, treating its first row as the header.
inputPath = "s3a://" + s3Bucket + "/datasets/sample_text_data.tsv"
feedbackFile = sqlContext.read.option("sep", "\t").csv(inputPath, header=True)

feedbackFile.show()

In [None]:
# Print the schema (column names and types) of the loaded Spark DataFrame.
feedbackFile.printSchema()

# NOTE(review): PATH is not referenced anywhere else in the visible notebook; it looks
# like a leftover from loading a local copy of the data — confirm before removing.
PATH = './Responses_1.tsv'

### Convert data to a Pandas dataframe and create dataset to export

In [None]:
# Collect the whole Spark DataFrame into a local pandas DataFrame for the analysis below.
df = feedbackFile.toPandas()

# Expose the Spark DataFrame to SQL under the name "analysis".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
feedbackFile.createOrReplaceTempView("analysis")

# Keep only the rows whose Outcome is exactly 'Successful'.
outputDf = sqlContext.sql("SELECT * FROM analysis WHERE Outcome == 'Successful'")

outputDf.show()

### Store output file in Ceph S3

In [None]:
# Export the rows selected above (Outcome == 'Successful') back to the S3 bucket.
# Any other filtering, or joins with further datasets, could be exported the same way.
# mode="overwrite" replaces whatever a previous run left at this path.
outputPath = "s3a://" + s3Bucket + "/output/filtered_text_data.tsv"
outputDf.write.option("sep", "\t").csv(outputPath, header=True, mode="overwrite")

In [None]:
# The Spark work is finished, so shut the SparkContext down.
sc.stop()
# `df` is the pandas copy produced by toPandas(), so it is still usable here.
df.head()

In [None]:

# Seed NumPy's global random generator with a fixed value so runs are repeatable.
# The seed is the sum of the character codes of the word "categorical".
seed_value = sum(ord(ch) for ch in "categorical")
np.random.seed(seed_value)

In [None]:
# Numeric code for each outcome label, numbered in the order listed here.
outcome_dict = {
    label: code
    for code, label in enumerate(('Successful', 'Partial Success', 'Unsuccessful'))
}

In [None]:
# Take an explicit copy of the two columns: adding a column to a plain slice of `df`
# triggers pandas' SettingWithCopyWarning and is not guaranteed to behave consistently.
df_vis = df[['Your Name', 'Outcome']].copy()

# Translate each outcome label to its numeric code; a label that is missing from
# outcome_dict raises KeyError rather than being silently skipped.
df_vis['outcome_numeric'] = df_vis['Outcome'].apply(lambda label: outcome_dict[label])



In [None]:
# Count how many reports each person has per outcome (rows: names, columns: outcomes).
outcome_cross_table = pd.crosstab(df_vis["Your Name"], df_vis["Outcome"])


In [None]:
# Stacked bar chart: one bar per person, split by outcome.
outcome_cross_table.plot(
    kind="bar",
    stacked=True,
    figsize=(16, 12),
    fontsize=12,
)
plt.show()

In [None]:
# Count reports per primary audience and outcome (rows: audiences, columns: outcomes).
event_type_cross_table = pd.crosstab(df["Primary Audience Engaged"], df["Outcome"])

In [None]:
# Stacked bar chart: one bar per primary audience, split by outcome.
event_type_cross_table.plot(
    kind="bar",
    stacked=True,
    figsize=(16, 12),
    fontsize=12,
)
plt.show()

### Count the number of entries in each column

In [None]:
# Per-column count of non-missing entries.
df.count()

In [None]:
# Number of distinct values in the "Your Name" column.
len(df['Your Name'].unique())

### Unique names

In [None]:
# List the distinct values in the "Your Name" column.
df['Your Name'].unique()

In [None]:
# List the distinct values in the "Trip Name" column.
df['Trip Name'].unique()

### Convert "Highlights" to string from object

In [None]:
# Preview the first few "Highlights" entries before converting them.
df['Highlights'].head()

In [None]:
# Replace missing highlights with an empty string before casting. A bare astype(str)
# turns missing values into the literal words "None"/"nan", which would then appear
# in the joined text, the word counts and the word clouds.
df['Highlights'] = df['Highlights'].fillna('').astype(str)

### Joining all the "Highlights" for visualization

In [None]:
# Concatenate every report's highlights into one space-separated string.
joined_Highlights = " ".join(df['Highlights'])

In [None]:
# Display the combined text.
joined_Highlights

### Collect all the links mentioned in the reports

In [None]:

# Pull out every http/https URL: each match runs from the scheme to the next whitespace.
link_pattern = re.compile(r'(https?://[^\s]+)')
links = link_pattern.findall(joined_Highlights)

# Print one link per line.
for url in links:
    print(url)

### Remove urls from actual text

In [None]:
def rm_url(input_string):
    """Return *input_string* with every http/https URL removed.

    A URL is taken to run from ``http://`` or ``https://`` up to the next
    whitespace character, wherever it occurs in the text. Only the URL itself
    is removed; the surrounding whitespace and text are left untouched.

    The previous pattern was anchored to the start of a line and consumed the
    rest of that line, so URLs in the middle of the text were kept and any
    text following a leading URL was deleted along with it.
    """
    return re.sub(r'https?://\S+', '', input_string)

In [None]:
# Strip URLs from the combined text so they do not end up in the word counts.
joined_Highlights = rm_url(joined_Highlights)

### Removing stopwords

In [None]:
def rm_stopwords(input_string):
    """Return *input_string* lower-cased with English stop words removed.

    The text is lower-cased, split on single spaces, filtered against NLTK's
    English stop-word list and re-joined with single spaces.
    """
    # Build the stop-word set once per call; building it inside the per-token
    # test would reload the whole corpus list for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = input_string.lower().split(" ")
    return " ".join(token for token in tokens if token not in stop_words)

In [None]:
# Lower-case the combined text and drop English stop words.
joined_Highlights = rm_stopwords(joined_Highlights)

### Wordcloud plot method

In [None]:
def plot_wc(input_string):
    """Generate a word cloud from *input_string* and display it.

    The cloud is rendered at 1200x1000 on a black background and shown in a
    30x16 matplotlib figure with the axes hidden. Nothing is returned.
    """
    cloud_options = {
        'background_color': 'black',
        'width': 1200,
        'height': 1000,
    }
    cloud = WordCloud(**cloud_options).generate(input_string)

    plt.figure(figsize=(30, 16))
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
    

#### Keywords of interest

In [None]:
# Keywords of interest for the analysis.
# NOTE(review): 'satelite' looks like a misspelling of 'satellite' — confirm against
# the report text before relying on it for matching.
key_word = [
    'container',
    'kubernetes',
    'rhel',
    'openstack',
    'openshift',
    'ansible',
    'cloud',
    'integration',
    'product',
    'deployment',
    'satelite',
    'security',
    'compliance',
]

In [None]:
# Frequency of every space-separated token in the cleaned text.
tokens = joined_Highlights.split(' ')
counted = Counter(tokens)

In [None]:
# Show every token with its count, most frequent first.
counted.most_common()

### Word cloud with total data

In [None]:
# Word cloud over the cleaned text of all reports.
plot_wc(joined_Highlights)

### Analysing by "Outcome"

In [None]:
# Preview the first 20 highlight/outcome pairs.
df[['Highlights','Outcome']].head(20)

In [None]:
# Keep only the two columns needed for the per-outcome analysis.
df_outcome = df[['Highlights','Outcome']]

### Plotting WordCloud by "outcome" grouping

In [None]:
# For each outcome, concatenate all of its highlights into one space-separated string.
# The result is a one-column DataFrame indexed by outcome.
highlights_by_outcome = df_outcome.groupby('Outcome')['Highlights']
grouped_highlights = pd.DataFrame(
    highlights_by_outcome.apply(lambda texts: ' '.join(texts))
)

In [None]:
# Display the one-row-per-outcome table.
grouped_highlights

### WordCloud for successful outcomes

In [None]:
# Copy the outcome labels (currently the index) into a regular "Outcome" column,
# then replace the index with the default integer one.
# Index.get_values() was removed in pandas 1.0; iterating the index yields the same labels.
grouped_highlights['Outcome'] = list(grouped_highlights.index)
grouped_highlights.reset_index(drop=True, inplace=True)

In [None]:
# Make sure the concatenated text column holds plain strings.
grouped_highlights['Highlights'] = grouped_highlights['Highlights'].astype(str)

In [None]:
# Select the concatenated highlights of the "Successful" group and join them into one string.
successful_mask = grouped_highlights['Outcome'] == "Successful"
text = " ".join(grouped_highlights.loc[successful_mask, "Highlights"])

In [None]:
# Clean the "Successful" text (URLs first, then stop words) and plot its word cloud.
plot_wc(rm_stopwords(rm_url(text)))