In [9]:
!pip install SpeechRecognition



In [10]:
import nltk
import pandas as pd
import re              #package for importing regular expression
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import speech_recognition as sr
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91704\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91704\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
def audio_to_text():
    """Capture one utterance via ``sr.Microphone`` and transcribe it.

    Prints "Listening..." before and "Listening stopped" after the capture.

    Returns:
        list: a one-element list holding the text returned by
        ``recognize_google``, or an empty list when the recognizer raises
        ``sr.UnknownValueError`` (a message is printed in that case).
    """
    recognizer = sr.Recognizer()

    # The microphone is only held open for the duration of the capture.
    with sr.Microphone() as source:
        print("Listening...")
        audio = recognizer.listen(source)
        print("Listening stopped")

    try:
        return [recognizer.recognize_google(audio)]
    except sr.UnknownValueError:
        print("Sorry, I could not understand.")
        return []


In [12]:
# Method to preprocess the data
def preprocessing(dataset, num_of_rows=1):
    """Clean and lemmatize the first ``num_of_rows`` documents of ``dataset``.

    Each document goes through these steps, in order: non-word characters are
    replaced by spaces, isolated single letters are dropped (both in the
    middle and at the very start of the text), runs of whitespace are
    collapsed, the text is lower-cased, and every token is passed through
    ``WordNetLemmatizer.lemmatize``.

    Args:
        dataset: indexable collection of strings; items are read as
            ``dataset[0]`` .. ``dataset[num_of_rows - 1]``.
        num_of_rows: number of leading items to process (default 1).

    Returns:
        list[str]: one cleaned, space-joined string per processed item.
    """
    lemmatizer = WordNetLemmatizer()
    corpus = []
    for i in range(num_of_rows):
        # Replace every non-word character (punctuation, symbols) with a space.
        document = re.sub(r'\W', ' ', dataset[i])

        # Drop single letters that are surrounded by whitespace.
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Drop a single letter at the very start of the document. The caret
        # has to be an unescaped anchor: an escaped r'\^' would look for a
        # literal '^', which can never survive the \W substitution above.
        document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

        # Collapse runs of whitespace into one space.
        document = re.sub(r'\s+', ' ', document)

        document = document.lower()

        # Lemmatize token by token and stitch the document back together.
        tokens = [lemmatizer.lemmatize(w) for w in document.split()]
        corpus.append(' '.join(tokens))
    return corpus

In [13]:
# Method to train model 1
def predict_using_count_vectoriser(dataset, num_of_rows, callData):
    """Train a bag-of-words Naive Bayes model and label the given conversation.

    The first ``num_of_rows`` rows of ``dataset`` are preprocessed and split
    70/30 into train and test sets (fixed ``random_state``). A
    ``CountVectorizer`` is fitted on the training text, a ``MultinomialNB``
    classifier is trained on the resulting counts, and the accuracy and
    confusion matrix on the held-out split are printed.

    Args:
        dataset: DataFrame with a ``call_content`` text column and a ``Label``
            column (the confusion matrix is reported for the labels
            ``'normal'`` and ``'fraud'``).
        num_of_rows: number of leading rows of ``dataset`` to use.
        callData: list of raw conversation strings to classify.

    Returns:
        tuple: ``(callData, live_y_pred)`` where ``live_y_pred`` is the array
        of predicted labels for ``callData`` (empty when ``callData`` is
        empty).
    """
    # Features are the cleaned transcripts; slice the labels to the same
    # number of rows so x and y always have matching lengths.
    x = preprocessing(dataset['call_content'], num_of_rows)
    y = dataset['Label'].iloc[:num_of_rows]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=42
    )

    # Apply the same cleaning to the conversation that is to be classified.
    live_test = preprocessing(callData, len(callData))

    # Tokenize the text and count token occurrences.
    count_vectorizer = CountVectorizer(
        max_features=1500,
        min_df=5,
        max_df=0.7,
        stop_words=stopwords.words('english'),
    )

    # Learn the vocabulary on the training split only; the test split is
    # merely transformed with that vocabulary.
    count_train = count_vectorizer.fit_transform(x_train)
    count_test = count_vectorizer.transform(x_test)

    # Fit the Naive Bayes classifier on the training counts.
    nbclassifier = MultinomialNB()
    nbclassifier.fit(count_train, y_train)
    y_pred = nbclassifier.predict(count_test)

    if live_test:
        live_y_pred = nbclassifier.predict(count_vectorizer.transform(live_test))
    else:
        # Nothing was transcribed: skip predict(), which is not meant to be
        # called with zero samples, and hand back an empty array that has the
        # same dtype as the class labels.
        live_y_pred = nbclassifier.classes_[:0]

    score = metrics.accuracy_score(y_test, y_pred)

    # confusion_matrix takes (y_true, y_pred): rows are the true labels and
    # columns are the predicted labels.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=['normal', 'fraud'])
    print("Accuracy score when using count vectoriser class: ", score)
    print("CONFUSION MATRIX\n", cm)
    return callData, live_y_pred


In [None]:

# Main calling
# Read the pipe-delimited training data from disk.
dataset = pd.read_csv(r"Fraud_calls.txt", sep='|')

# dataset.shape is (rows, columns); only the row count is used below.
num_of_rows, y = dataset.shape

# Capture the conversation to classify from the microphone.
callData = audio_to_text()

print("\n ********** USING COUNT VECTORISER **********\n")
conversation, label = predict_using_count_vectoriser(
    dataset, num_of_rows, callData
)
print("Audio Received:{} \npredicted as label:{}".format(conversation, label))