In [None]:
# Environment setup: install the chromium-chromedriver system package, copy the
# driver binary into /usr/bin, and install the Selenium Python bindings.
# NOTE(review): the chromedriver source path looks image-specific — confirm it
# exists on the machine this notebook runs on.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

In [None]:
from selenium import webdriver
# Option container; the next cell adds command-line flags to it and the cell
# after that passes it to webdriver.Chrome.
options = webdriver.ChromeOptions()

In [None]:
# Register the three Chrome command-line flags used for this run, in order.
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  options.add_argument(chrome_flag)

In [None]:
# Start the Chrome session with the flags collected in `options`; scrape_data()
# reads this `driver` as a module-level global.
driver = webdriver.Chrome(options=options)

In [None]:
# NOTE(review): the browser is started with --headless, so there is no visible
# window — confirm this call still has the intended effect on the page layout.
driver.maximize_window()

In [None]:
import os

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

### Reading Link File

In [None]:
# Load the spreadsheet of pages to scrape; later cells read its URL_ID and URL
# columns.
links = pd.read_excel('Input.xlsx')
links.head()

In [None]:
# Print the identifier / address pair of the first two rows as a sanity check.
preview = links.head(2)
for url_id, url in zip(preview['URL_ID'], preview['URL']):
  print(url_id, url)

### Now we will scrape the article text from the pages behind these links

In [None]:
# function to scrape data from the links
def scrape_data(link):
  """Open ``link`` in the shared browser and return the article body text.

  Uses the module-level ``driver`` created earlier in the notebook.

  Args:
    link: URL of the page to load.

  Returns:
    The ``.text`` of the first ``div`` whose class attribute contains
    ``td-post-content``.

  Raises:
    Propagates whatever ``driver.get`` / ``driver.find_element`` raise, e.g.
    when no matching element is found.
  """
  # The implicit wait only applies to lookups issued after it has been set.
  # Setting it after find_element (as before) left the very first page without
  # any wait, so it must come first.
  driver.implicitly_wait(10)
  driver.get(link)
  content = driver.find_element(By.XPATH,"//div[contains(@class,'td-post-content')]")
  return content.text

In [None]:
# function to save the scraped files
def save_file(scrapdata):
  """Write every scraped record to its own UTF-8 text file.

  Each record is stored as ``./sample_data/scraped_files/<URL-ID>.txt``. The
  directory is created when it does not exist yet, and an existing file with
  the same name is overwritten.

  Args:
    scrapdata: Iterable of mappings with the keys ``'URL-ID'`` (used for the
      file name) and ``'TEXT'`` (the string written to the file).
  """
  out_dir = "./sample_data/scraped_files/"
  # open() does not create missing parent directories.
  os.makedirs(out_dir, exist_ok=True)
  for data in scrapdata:
    fname = str(data['URL-ID']) + ".txt"
    # The context manager closes the handle even if write() raises.
    with open(out_dir + fname, 'w', encoding='utf-8') as f:
      f.write(data['TEXT'])

### Performing the scraping operation

In [None]:
# Visit every row of `links`, collect one {'URL-ID', 'TEXT'} record per page,
# then write all records to disk in one go.
data = []
for row_label, row in links.iterrows():
  data.append({'URL-ID': row['URL_ID'], 'TEXT': scrape_data(row['URL'])})
save_file(data)
       

### Making a data frame of the scraped data

In [None]:
# One row per scraped page, with the columns 'URL-ID' and 'TEXT' taken from the
# record dictionaries built in the scraping loop.
df=pd.DataFrame(data)

In [None]:
# Preview the first five scraped records.
df.head(5)

In [None]:
# Save all scraped records in a single CSV file.
# NOTE(review): the file name 'contect.csv' looks like a typo for 'content.csv'
# — confirm before renaming, since other tooling may read this path.
# NOTE(review): index=None is presumably meant as index=False (no index column)
# — confirm against the pandas documentation.
df.to_csv('contect.csv',index=None)

### Let us do some preprocessing of the data before we perform sentiment analysis on it

In [None]:
# Count sentences as the non-empty pieces between full stops. A bare
# len(x.split('.')) over-counts: text ending in '.' yields a trailing empty
# piece, and runs such as '...' yield several empty pieces.
# This must run on the raw text, before normalization() strips the full stops.
# A text with no non-empty piece now gets 0, so the per-sentence averages
# derived from this column are NaN/inf for such rows.
df["Number of sentences"]=df['TEXT'].apply(lambda x:sum(1 for part in x.split('.') if part.strip()))

In [None]:
# Preview the frame with the new 'Number of sentences' column.
df.head(5)

### Replacing short form of words

In [None]:
def short_forms():
    """Return the table that maps short/contracted spellings to their expansion.

    A new dict is built on every call. Keys are stored verbatim — many contain
    an apostrophe and a few start with a capital letter — so callers have to
    look tokens up in exactly that form (or fold the keys themselves).

    Returns:
        dict[str, str]: short form -> expanded form.
    """
    # Each key appears exactly once; repeated keys in a dict literal are
    # silently collapsed and only hide typos.
    return {
        "cant":"can not",
        "dont":"do not",
        "wont":"will not",
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks",
        "im":"i am"
        }

In [None]:
import re  ##check if a particular string matches a given regular expression
import string

## function to clean a text and replace the short forms
def normalization(data):
    """Lower-case ``data``, strip noise and expand short forms.

    Steps, in order: lower-case; turn typographic apostrophes into ``'``;
    blank out URLs; drop the ``#`` of hashtags; delete digits; replace every
    punctuation mark except ``'`` with a space; expand tokens found in
    ``short_forms()``; finally replace left-over apostrophes with spaces and
    collapse whitespace.

    Args:
        data: Any object; it is converted with ``str()`` first.

    Returns:
        str: the cleaned, lower-case text with words separated by one space.
    """
    data = str(data).lower()
    # Unify apostrophes up front so "don’t" and "don't" hit the same table key.
    data = data.replace("’", "'")

    # URL — raw string, because "\s" in a plain literal is an invalid escape;
    # the dot after "www" is escaped so it only matches a literal dot.
    data = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', data)
    data = re.sub(r'#([^\s]+)', r'\1', data)

    # Number
    data = ''.join(ch for ch in data if not ch.isdigit())

    # Punctuation — the apostrophe is kept for now: stripping it before the
    # lookup would turn "can't" into "can t" and no contraction could match.
    for sym in string.punctuation:
        if sym != "'":
            data = data.replace(sym, " ")

    # The text is lower-case at this point, so fold the table the same way;
    # otherwise keys such as "I'm" could never match.
    short_form = {key.lower(): value.lower() for key, value in short_forms().items()}
    converted = [short_form.get(word, word) for word in data.split()]

    # Apostrophes that were not part of a known short form are punctuation too.
    data = " ".join(converted).replace("'", " ")
    return " ".join(data.split())

In [None]:
# Overwrite TEXT with its normalized form. 'Number of sentences' was derived
# from the raw text earlier, because normalization() replaces punctuation.
df['TEXT']=df['TEXT'].apply(normalization)

In [None]:
# Lower-case the text once more. normalization() already starts by
# lower-casing its input, so this pass acts as a safety net.
df['TEXT']=df['TEXT'].apply(lambda x:x.lower())

In [None]:
# Preview the normalized text.
df.head()

### Performing Sentiment Analysis

In [None]:
# LoughranMcDonald_MasterDictionary_2020 is the reference word list for the
# sentiment analysis; later cells read its Word, Negative, Positive and
# Uncertainty columns.

guide=pd.read_csv('LoughranMcDonald_MasterDictionary_2020.csv')
guide.head()

In [None]:
# Words whose Negative flag is set, selected with a single label-based lookup.
guide.loc[guide['Negative'] > 0, 'Word']

### Assigning Positive and Negative score to our words based on the dictionary words

In [None]:
# Split the dictionary words into three lower-cased lists. A row is filed under
# the first category that flags it, checked in the order Negative, Positive,
# Uncertainty, so no row lands in more than one list.
flag_neg = guide['Negative'] > 0
flag_pos = guide['Positive'] > 0
flag_unc = guide['Uncertainty'] > 0
neg = [entry.lower() for entry in guide.loc[flag_neg, 'Word']]
pos = [entry.lower() for entry in guide.loc[~flag_neg & flag_pos, 'Word']]
Uncertain = [entry.lower() for entry in guide.loc[~flag_neg & ~flag_pos & flag_unc, 'Word']]

In [None]:
# Check the frame once more before scoring.
df.head()

In [None]:
def positivescore(text):
  """Count the words of ``text`` that occur in the positive word list.

  Reads the module-level ``pos`` list. Every occurrence counts, so a word that
  appears twice adds two.

  Args:
    text: Whitespace-separated words.

  Returns:
    int: number of positive-word occurrences (0 for empty text).
  """
  # One set per call keeps each membership test O(1); scanning the list for
  # every word is O(words x lexicon).
  lexicon = set(pos)
  return sum(1 for word in text.split() if word in lexicon)

In [None]:
def negativescore(text):
  """Return minus the number of negative-list words found in ``text``.

  Reads the module-level ``neg`` list. The result is zero or negative: each
  occurrence of a listed word lowers the score by one. Callers that need a
  count have to take the magnitude.

  Args:
    text: Whitespace-separated words.

  Returns:
    int: ``-k`` where ``k`` is the number of negative-word occurrences.
  """
  # One set per call keeps each membership test O(1); scanning the list for
  # every word is O(words x lexicon).
  lexicon = set(neg)
  return -sum(1 for word in text.split() if word in lexicon)

In [None]:
# Score every document. PositiveScore is >= 0; NegativeScore is <= 0 because
# negativescore() subtracts one per hit.
df['PositiveScore']=df['TEXT'].apply(positivescore)
df['NegativeScore']=df['TEXT'].apply(negativescore)

### Getting all the different parameters

In [None]:
# negativescore() reports its hits as a non-positive number, so combine the
# positive count with the magnitude of the negative score. Using the signed
# value swaps numerator and denominator of the polarity ratio (and makes it
# blow up when both counts are equal) and makes the subjectivity ratio subtract
# negative hits instead of adding them.
negative_count = df['NegativeScore'].abs()
# polarity = (P - N) / (P + N + eps)
df['POLARITY SCORE']=(df['PositiveScore']-negative_count)/ ((df['PositiveScore'] + negative_count) + 0.000001)
df['WORD COUNT']=df['TEXT'].apply(lambda x:len(x.split()))
# subjectivity = (P + N) / (word count + eps)
df['SUBJECTIVITY SCORE']=(df['PositiveScore'] + negative_count)/ ((df['WORD COUNT']) + 0.000001)
# Both columns below hold words per sentence (same expression).
df['AVG SENTENCE LENGTH']=df['WORD COUNT']/df['Number of sentences']
df['AVG NUMBER OF WORDS PER SENTENCE'] = df['WORD COUNT']/df['Number of sentences']

In [None]:
## for avg length of words
def avgwordlength(text):
    """Return the mean number of characters per whitespace-separated word.

    Args:
        text: The text to measure.

    Returns:
        float: total characters in all words divided by the number of words,
        or ``0.0`` when ``text`` contains no words.
    """
    words = text.split()
    if not words:
        # Empty or whitespace-only text: avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
## for counting the personal pronouns in a text
def pronoun(text):
    """Count whole-word occurrences of the listed personal pronouns.

    The words are i, me, we, my, ours and us, each in lower-case and
    capitalised form. Matches are whole words only, so "use" or "music" do not
    count as "us".

    Args:
        text: The text to scan.

    Returns:
        int: number of matches (0 when there are none).
    """
    # The first alternative used to be "s?i", which also counted the token
    # "si" as a pronoun.
    pronouns = r"\b(i|me|we|my|ours|us|I|Me|We|My|Ours|Us)\b"
    return sum(1 for _match in re.finditer(pronouns, text))

In [None]:
# Per-document metrics computed from the normalized, lower-cased TEXT column.
df['AVG WORD LENGTH']=df['TEXT'].apply(avgwordlength)
# NOTE(review): 'AVG SENTENCE LENGTH' is assigned the same expression in the
# earlier metrics cell — this line looks redundant, confirm before removing.
df['AVG SENTENCE LENGTH']=df['WORD COUNT']/df['Number of sentences']
# TEXT is lower-case at this point, so only the lower-case alternatives of the
# pronoun pattern can match.
df['PERSONAL PRONOUNS']=df['TEXT'].apply(pronoun)

In [None]:
# Show the rows with at least one positive-word hit.
df[df['PositiveScore']>0]

In [None]:
# Attach the source URL to each row. The assignment aligns on index labels —
# assumes df and links share the same row index, TODO confirm.
df['URL']=links['URL']

In [None]:
# Final preview of the frame with all derived columns.
df.head()