In [None]:
# import required libraries
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# download NLTK stopwords dictionary
nltk.download('stopwords')

# import regular expression library and clean text library
import re
!pip install cleantext
from cleantext import clean

# import textblob for sentiment analysis
from textblob import TextBlob

# import csv for data handling
import csv

# install and import speech recognition library
!pip install speechrecognition
import speech_recognition as sr

In [None]:
r = sr.Recognizer()

# open the audio file using AudioFile context manager
with sr.AudioFile('sound.wav') as source:
    # record audio from source file
    audio = r.record(source)

# transcribe the audio using Google Speech Recognition API
try:
    text = r.recognize_google(audio)
    print("Google Speech Recognition thinks you said: " + text)
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Google Speech Recognition service; {0}".format(e))

# create a new CSV file for writing
with open('output_file.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # write the header row name
    writer.writerow(['text'])
    
    # split the text into words
    words = text.split()
    while words:
        # join 10 words into a single row & make multiple rows for better application of model
        row = ' '.join(words[:10])
        writer.writerow([row])
        words = words[10:]

In [141]:
# read the CSV file into a pandas DataFrame
data = pd.read_csv('tweets.csv', header=0, encoding='utf-8')

# remove unwanted characters from the 'tweet' column
data['tweet'] = data['tweet'].str.replace('[#,@,&,*,&,,ð,¤,,,,,,,,,,,,,,,,,â,,ï]', '')
print(data)

In [143]:
# define a function to add text polarity to a DataFrame
def add_polarity_to_df(df):
    polarity_list = []
    
    # iterate over each row of the DataFrame
    for x in range(0, df.shape[0]):
        QuantTextBlob = TextBlob(df.iloc[x][2])
        measures = QuantTextBlob.sentiment.polarity
        polarity_list.append(measures)
    
    # add the polarity list as a new column to the DataFrame
    df['Text Polarity'] = polarity_list
    return df

polar = add_polarity_to_df(data)
polar

In [144]:
polar

Unnamed: 0,id,label,tweet,Text Polarity
0,1,0,@user when a father is dysfunctional and is s...,-0.500000
1,2,0,@user @user thanks for #lyft credit i can't us...,0.200000
2,3,0,bihday your majesty,0.000000
3,4,0,#model i love u take with u all the time in ...,0.976562
4,5,0,factsguide: society now #motivation,0.000000
...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,0.000000
31958,31959,0,to see nina turner on the airwaves trying to...,0.400000
31959,31960,0,listening to sad songs on a monday morning otw...,-0.500000
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",0.000000


In [145]:
# split the 'data' DataFrame into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(data['tweet'], data['label'], test_size=0.2, random_state=42)

# print the first few rows of the 'train_labels' Series
train_labels.head()

12110    1
14081    0
1829     0
2769     0
31818    0
Name: label, dtype: int64

In [146]:
stop_words = nltk.corpus.stopwords.words('english')
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, tokenizer=tokenizer.tokenize)
train_features = vectorizer.fit_transform(train_data)
val_features = vectorizer.transform(val_data)



In [148]:
# Create a new Multinomial Naive Bayes classifier
clf = MultinomialNB()

# Train the classifier using the training data and labels
clf.fit(train_features, train_labels)

# Print the predicted labels for the training data
print(clf.predict(train_features))

[0 0 0 ... 1 1 0]


In [149]:
val_pred = clf.predict(val_features)
# Calculate various performance metrics based on the predicted and actual labels
accuracy = accuracy_score(val_labels, val_pred)
precision = precision_score(val_labels, val_pred)
recall = recall_score(val_labels, val_pred)
f1 = f1_score(val_labels, val_pred)
cm = confusion_matrix(val_labels, val_pred)

In [150]:
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print(f'Confusion matrix: \n{cm}')

Accuracy: 0.9588612544971062
Precision: 0.8560885608856088
Recall: 0.5087719298245614
F1-score: 0.6382393397524071
Confusion matrix: 
[[5898   39]
 [ 224  232]]


In [151]:
test_data = pd.read_csv('output_file.csv', header=0, encoding='utf-8')

In [152]:
test_features = vectorizer.transform(test_data['text'])

In [153]:
test_pred = clf.predict(test_features)

In [154]:
test_data['label'] = test_pred
test_data.to_csv('test_predictions.csv', index=False)


In [155]:
data = pd.read_csv('test_predictions.csv',header=0, encoding='utf-8')

In [156]:
def add_polarity_to_df(df):
    polarity_list = []
    for x in range(0, df.shape[0]):
        QuantTextBlob = TextBlob(df.iloc[x][0])
        measures = QuantTextBlob.sentiment.polarity
        polarity_list.append(measures)
    df['Text Polarity'] = polarity_list
    return df
polars= add_polarity_to_df(data)

In [157]:
polars

Unnamed: 0,text,label,Text Polarity
0,have a child as a tragedy for the family for,0,0.0
1,the world and I was in my forties understood a...,0,0.0
2,childhood that I was sandal on the back in 1950,0,0.0
3,over 20 million children under the age of 5 would,0,0.0
4,die by the year 2000 in about 9.7 million about,0,0.0
5,10% of the children were born since the year 2013,0,0.0
6,unbelievable 5 million of course we would like...,0,-0.25
7,that down to be even lower so where those that's,0,-0.155556
8,what's going on there overwhelmingly it's infe...,1,0.5
9,malaria we can see here that non-communicable ...,0,0.0
