**Import New Dataset**

In [1]:
#load the libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import re 
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the cp1252-encoded tweet CSV and discard its 'Unnamed: 0' column.
raw_data = pd.read_csv(
    "../input/data-science-assignments/TEXT MINING/Elon_musk.csv",
    encoding='cp1252',
).drop(columns='Unnamed: 0')
# Work on a copy so the frame as loaded stays available in `raw_data`.
df = raw_data.copy()
df  # author's note: there are 1999 tweets

In [3]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info() 

In [4]:
# Cast every entry of the Text column to str so the regex-based cleaning always receives strings.
df["Text"] = df["Text"].astype(str)

**Preprocessing data**

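We do not remove stopwords during preprocessing, because tweets are short and consist mostly of small words. Stopwords are filtered out only later, when the word clouds are generated.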

In [5]:
# Clean The Data using RegEx
#result = re.sub(pattern, repl, string, count=0, flags=0)

def cleantext(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in order: @mentions, '#' symbols (the hashtag word itself is
    kept), the retweet marker "RT", http(s) links, and underscores.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the noise removed. Whitespace around the removed
        pieces is left untouched.
    """
    # Handles may contain underscores, so "_" is part of the character class;
    # without it "@elon_musk" would leave "_musk" behind.
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # Remove mentions
    text = re.sub(r"#", "", text)  # Remove the hashtag symbol only
    # The word boundary keeps words that merely end in "RT" (e.g. "SMART") intact.
    text = re.sub(r"\bRT\s+", "", text)  # Remove the retweet marker
    text = re.sub(r"https?://\S+", "", text)  # Remove hyperlinks
    # Drop only the underscore itself, never the character that follows it.
    text = re.sub(r"_", "", text)  # Remove underscores
    return text
# Run every tweet through cleantext and overwrite the Text column with the results.
df["Text"] = [cleantext(tweet) for tweet in df["Text"]]
df.head()

In [6]:
# Look at the last rows as well, to check the cleaned text at the end of the frame.
df.tail()

**Tweet Sentiment Values**

In [7]:
# Create one shared VADER analyzer; the sentiment helper functions in this notebook all use it.
# NOTE(review): construction presumably needs the NLTK `vader_lexicon` resource to be
# installed in the environment — confirm on a fresh kernel.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentialz = SentimentIntensityAnalyzer()

In [8]:
# Sanity check: score a single tweet (row position 2) to inspect what the analyzer returns.
sentialz.polarity_scores(df.Text.iloc[2])

In [9]:
# Helpers that turn VADER polarity scores into a sentiment class.
def _label_from_compound(compound):
    """Map a compound polarity score to 'positive', 'negative' or 'neutral'.

    A score above zero is positive, below zero is negative, and exactly
    zero is neutral.
    """
    if compound > 0:
        return 'positive'
    if compound < 0:
        return 'negative'
    return 'neutral'


def get_senti_class(review):
    """Return the sentiment class of `review`.

    Parameters
    ----------
    review : str
        Text to score with the module-level `sentialz` analyzer.

    Returns
    -------
    str
        'positive', 'negative' or 'neutral', based on the sign of the
        "compound" entry of the polarity scores.
    """
    return _label_from_compound(sentialz.polarity_scores(review)["compound"])


def get_sentiment(review):
    """Return the polarity scores and the sentiment class of `review`.

    Parameters
    ----------
    review : str
        Text to score with the module-level `sentialz` analyzer.

    Returns
    -------
    pandas.Series
        Two elements: the dict returned by `polarity_scores`, followed by
        the sentiment class string.
    """
    # Score the text once and derive the label from that same result,
    # rather than running the analyzer a second time for the class.
    scores = sentialz.polarity_scores(review)
    return pd.Series([scores, _label_from_compound(scores["compound"])])

In [10]:
# Score every tweet; the two-element result of get_sentiment fills the
# Scores and Sentiment columns.
df[["Scores", "Sentiment"]] = df["Text"].apply(get_sentiment)
df

In [11]:
# Pull the compound value out of each tweet's score dict, then tally the sentiment labels.
comp_values = [scores.get('compound') for scores in df["Scores"]]
df["Sentiment"].value_counts()

In [12]:
# Pass the column by keyword: current seaborn treats the first positional
# argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(x=df.Sentiment, palette='cool_r')  # There are very few negative tweets

In [13]:
# Plot the compound score of each tweet in list order.
# NOTE(review): `palette` probably has no effect here because no hue variable is assigned — confirm.
sns.lineplot(data = comp_values, palette='cool_r') #Most are positive/Neutral tweets

In [14]:
# Show only the tweets whose sentiment class is negative.
df[df["Sentiment"] == 'negative']

**Word Cloud**

In [21]:
# Merge all tweets into one space-separated string for the word clouds
# (no stopword filtering happens at this step).
tweet_str = ' '.join(df['Text'])

In [22]:
# Build a word cloud from the combined tweet text and render it.
from matplotlib.pyplot import figure
font_path = '../input/metropolis/metropolis.regular.otf'
wordcloud_ip = WordCloud(
    font_path=font_path,
    background_color="white",
    max_words=500,
    max_font_size=200,
    stopwords=STOPWORDS,  # the word cloud filters its built-in stopword list
    random_state=42,
).generate(tweet_str)
figure(figsize=(2,3), dpi=300)
plt.imshow(wordcloud_ip,interpolation="bilinear")
plt.axis("off")  # hide the axes; only the image matters
plt.show()

**Word Cloud Shaped by an Elon Musk Image Mask**

In [25]:
from PIL import Image
# Read the mask inside a context manager so the image file is closed as soon
# as its pixel data has been copied into the array.
with Image.open("../input/elonkd/elon.png") as mask_image:
    maskArray = np.array(mask_image)

# Word cloud drawn inside the mask; limited to the 60 most frequent words.
elon_mask = WordCloud(background_color='black',
                      max_words = 60, 
                    stopwords = set(STOPWORDS),
                    colormap='Purples_r',
                    mask = maskArray,
                    font_path=font_path).generate(tweet_str)

figure(figsize=(3,4), dpi=300)
plt.imshow(elon_mask,interpolation="bilinear")
plt.axis("off")  # hide the axes; only the image matters
plt.show()