In [83]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from pydub import AudioSegment
from pydub.silence import split_on_silence
import speech_recognition as sr
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

### Audio Transcription

In [42]:
r = sr.Recognizer()

In [69]:
###creating audio chunks based on the the silence observed and then converting it to the text file
def get_audio_transcript(path):
    lines = []
    # open the audio file using pydub
    sound = AudioSegment.from_wav(path)
    # split audio sound where silence is 700 miliseconds or more and get chunks
    chunks = split_on_silence(sound,
        # experiment with this value for your target audio file
        min_silence_len = 500,
        # adjust this per requirement
        silence_thresh = sound.dBFS-14,
        # keep the silence for 1 second, adjustable as well
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    # create a directory to store the audio chunks

    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    # process each chunk
    for i, audio_chunk in enumerate(chunks, start=1):
        # export audio chunk and save it in
        # the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            # try converting it to text
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                continue

            else:
                text = f"{text.capitalize()}. "
                lines.append(text)
    return lines

In [None]:
path = "./Data/audio.wav"
textoupput = get_audio_transcript(path)
#print(*textoupput, sep='\n')


In [77]:
df = pd.DataFrame(textoupput, columns=['Summary'])
df.head()


Unnamed: 0,Summary
0,Maybe i am mad but i am now the proud owner of...
1,Bendy iphone 6.
2,So much bigger than iphone 4s finally got to s...
3,Not revolutionary at all but it's absolutely g...
4,I want one iphone 6.


In [78]:
###Saving the text file to dataset folder which can be used for further processing
df.to_csv('./Data/Transcripted.csv', index=False)

### Loading and pre-processing

In [120]:
df = pd.read_csv('./Data/Transcripted.csv', usecols=['Summary']).astype(str)
print(df.shape)
df.head()

(27, 1)


Unnamed: 0,Summary
0,Maybe i am mad but i am now the proud owner of...
1,Bendy iphone 6.
2,So much bigger than iphone 4s finally got to s...
3,Not revolutionary at all but it's absolutely g...
4,I want one iphone 6.


In [121]:
### creating tokens, part of speech tagging and base words(lemmatization) 
df['tokens']=""
df['pos_tags']=""
df['lemma']=""

In [122]:
for i in range(len(df['Summary'])):
    df['tokens'][i]= word_tokenize(df['Summary'][i])

In [123]:
for i in range(len(df['tokens'])):
    for j in range(len(df['tokens'][i])):
        df['pos_tags'][i] = nltk.pos_tag(df['tokens'][i])

In [124]:
# Lemmatize with POS Tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [125]:
def lemmatize_sentence(sentence):
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), df['pos_tags'][i])
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [126]:
lemmatizer = WordNetLemmatizer()
for i in range(len(df['pos_tags'])):
    for j in range(len(df['pos_tags'][i])):
        df['lemma'][i] = lemmatize_sentence(df['pos_tags'][i][j][0])


In [127]:
df.head()

Unnamed: 0,Summary,tokens,pos_tags,lemma
0,Maybe i am mad but i am now the proud owner of...,"[Maybe, i, am, mad, but, i, am, now, the, prou...","[(Maybe, RB), (i, JJ), (am, VBP), (mad, JJ), (...",Maybe i be mad but i be now the proud owner of...
1,Bendy iphone 6.,"[Bendy, iphone, 6, .]","[(Bendy, NNP), (iphone, NN), (6, CD), (., .)]",Bendy iphone 6 .
2,So much bigger than iphone 4s finally got to s...,"[So, much, bigger, than, iphone, 4s, finally, ...","[(So, RB), (much, RB), (bigger, JJR), (than, I...",So much big than iphone 4s finally get to see ...
3,Not revolutionary at all but it's absolutely g...,"[Not, revolutionary, at, all, but, it, 's, abs...","[(Not, RB), (revolutionary, JJ), (at, IN), (al...",Not revolutionary at all but it 's absolutely ...
4,I want one iphone 6.,"[I, want, one, iphone, 6, .]","[(I, PRP), (want, VBP), (one, CD), (iphone, NN...",I want one iphone 6 .


In [128]:
### function to calculate the sentiment score based on the positive and negative values
def sentimental_Score(sentence):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(sentence)
    score=vs['compound']
    if not vs['pos'] > 0.1:
        if vs['pos'] - vs['neg'] <= 0:
            return 'negative'
    elif not vs['neg'] > 0.1:
        if vs['pos'] - vs['neg'] > 0:
            return 'positive'
    else:
        return 'Neutral'

In [129]:
df['sentiment'] = df['lemma'].apply(lambda line: sentimental_Score(line))

In [130]:
df.head()

Unnamed: 0,Summary,tokens,pos_tags,lemma,sentiment
0,Maybe i am mad but i am now the proud owner of...,"[Maybe, i, am, mad, but, i, am, now, the, prou...","[(Maybe, RB), (i, JJ), (am, VBP), (mad, JJ), (...",Maybe i be mad but i be now the proud owner of...,Neutral
1,Bendy iphone 6.,"[Bendy, iphone, 6, .]","[(Bendy, NNP), (iphone, NN), (6, CD), (., .)]",Bendy iphone 6 .,negative
2,So much bigger than iphone 4s finally got to s...,"[So, much, bigger, than, iphone, 4s, finally, ...","[(So, RB), (much, RB), (bigger, JJR), (than, I...",So much big than iphone 4s finally get to see ...,negative
3,Not revolutionary at all but it's absolutely g...,"[Not, revolutionary, at, all, but, it, 's, abs...","[(Not, RB), (revolutionary, JJ), (at, IN), (al...",Not revolutionary at all but it 's absolutely ...,positive
4,I want one iphone 6.,"[I, want, one, iphone, 6, .]","[(I, PRP), (want, VBP), (one, CD), (iphone, NN...",I want one iphone 6 .,positive


In [137]:
df_1 = df[['Summary','sentiment']]

In [139]:
df_1.head()

Unnamed: 0,Summary,sentiment
0,Maybe i am mad but i am now the proud owner of...,Neutral
1,Bendy iphone 6.,negative
2,So much bigger than iphone 4s finally got to s...,negative
3,Not revolutionary at all but it's absolutely g...,positive
4,I want one iphone 6.,positive


In [140]:
###Adding flags where emotion/sentiment is changing 
df_1.iloc[0,0] = '['+str(df_1.iloc[0,1])+'] '+str(df_1.iloc[0,0])
for i in range(1,len(df_1)):
    if df_1.iloc[i,1] != df_1.iloc[i-1,1]:
        df_1.iloc[i,0] = '['+str(df_1.iloc[i,1])+'] '+str(df_1.iloc[i,0])

In [142]:
###saving the file to Data folder
with open('./Data/output.txt', 'w') as f:
    f.write(df_1['Summary'].str.cat(sep='\n'))