In [1]:
# Data Manipulation
import pandas as pd
import numpy as np

# Text cleaning 
import re
from nltk.corpus import stopwords

# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Punctuation that acts as a token separator; each match is replaced by a space.
# Raw string: `\[`, `\]` and `\|` are regex escapes, not (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r"[/(){}\[\]\|@,;!]")
# Any character other than a digit, a lowercase ASCII letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r"[^0-9a-z #+_]")
# English stop words held in a set so membership checks are constant time.
stop = set(stopwords.words('english'))

def text_preprocess(text):
    """ Preprocess the input text and return the cleaned text.

    The steps run in this order: drop @mentions and http(s) URLs, lowercase,
    delete digits, replace the punctuation matched by REPLACE_BY_SPACE_RE with
    spaces, replace every character matched by BAD_SYMBOLS_RE with a space,
    remove English stop words, and collapse runs of whitespace.

    Args:
        text (str): Input string

    returns:
        str: Cleaned string of space-separated lowercase tokens (may be empty)

    """
    # NOTE: the original first step was `text.replace("\d+", " ")`. str.replace
    # matches literally, so it never touched digits; digits are removed by the
    # regex below instead. It is deliberately NOT turned into a regex here:
    # blanking digits before the URL/mention pass would split tokens such as
    # "http://site/page123" and leave fragments behind.

    # removing mentions and urls (must happen while '@', ':' and '/' are intact)
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    # lowercase text so it matches the lowercase stop-word list and BAD_SYMBOLS_RE
    text = text.lower()

    # removing digits (deleted outright, so "abc123def" becomes "abcdef")
    text = re.sub('[0-9]+', '', text)

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(" ", text)

    # replace symbols which are in BAD_SYMBOLS_RE by space in text
    text = BAD_SYMBOLS_RE.sub(" ", text)

    # delete stopwords from text; split()/join also collapses repeated
    # whitespace and leaves no leading/trailing spaces, so no strip() is needed
    return ' '.join(word for word in text.split() if word not in stop)

In [3]:
def top_ngrams(scoring, corpus, ngram = (1,1), descending=True):
    ''' Rank the ngrams of a text corpus by frequency or by TF-IDF importance.

    Args:
        scoring (str): 'Frequency' ranks by raw counts (CountVectorizer). Any
            other value ranks by summed TF-IDF weights (TfidfVectorizer with
            smooth_idf=False, sublinear_tf=False and norm=None).
        corpus (list): Preprocessed text corpus, one string per document
        ngram (tuple, optional): (min_n, max_n) passed as ngram_range.
            Default is Unigram (1,1).
        descending (Bool, Optional): True lists the highest scores first,
            False lists the lowest scores first

    returns:
        pandas.DataFrame with columns 'ngram' and 'freq/imp', one row per
        ngram kept by the vectorizer (at most 1000), sorted by score. Ties keep
        the vectorizer's feature order.
    '''
    # Checking if the user wants Frequent or important ngrams
    if scoring == 'Frequency':
        vectorizer = CountVectorizer(max_features = 1000, ngram_range= ngram )
    else:
        vectorizer = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, ngram_range= ngram, max_features=1000)
    vector = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # get_feature_names_out() when available and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feat_names = vectorizer.get_feature_names_out()
    else:
        feat_names = vectorizer.get_feature_names()

    # Sum each ngram column directly on the sparse matrix; converting the whole
    # document-term matrix to a dense array first is unnecessary.
    freq_imp = np.asarray(vector.sum(axis=0)).ravel()

    # sorted() is stable in both directions, so tied ngrams keep feature order
    ngrams = sorted(zip(feat_names, freq_imp), key=lambda pair: pair[1], reverse=bool(descending))
    return pd.DataFrame(ngrams, columns=['ngram', 'freq/imp'])

## Creating a toy corpus to test our functions

In [4]:
# One-document toy corpus: a single row in the 'Text' column holding a
# six-paragraph passage about data science.
toy_corpus = pd.DataFrame({'Text': [""" So, Data Science is primarily used to make decisions and predictions making use of predictive causal analytics, prescriptive analytics (predictive plus decision science) and machine learning. Predictive causal analytics – If you want a model which can predict the possibilities of a particular event in the future, you need to apply predictive causal analytics. Say, if you are providing money on credit, then the probability of customers making future credit payments on time is a matter of concern for you. Here, you can build a model which can perform predictive analytics on the payment history of the customer to predict if the future payments will be on time or not.
Prescriptive analytics: If you want a model which has the intelligence of taking its own decisions and the ability to modify it with dynamic parameters, you certainly need prescriptive analytics for it. This relatively new field is all about providing advice. In other terms, it not only predicts but suggests a range of prescribed actions and associated outcomes.
The best example for this is Google’s self-driving car which I had discussed earlier too. The data gathered by vehicles can be used to train self-driving cars. You can run algorithms on this data to bring intelligence to it. This will enable your car to take decisions like when to turn, which path to take, when to slow down or speed up.
Machine learning for making predictions — If you have transactional data of a finance company and need to build a model to determine the future trend, then machine learning algorithms are the best bet. This falls under the paradigm of supervised learning. It is called supervised because you already have the data based on which you can train your machines. For example, a fraud detection model can be trained using a historical record of fraudulent purchases.
Machine learning for pattern discovery — If you don’t have the parameters based on which you can make predictions, then you need to find out the hidden patterns within the dataset to be able to make meaningful predictions. This is nothing but the unsupervised model as you don’t have any predefined labels for grouping. The most common algorithm used for pattern discovery is Clustering.
Let’s say you are working in a telephone company and you need to establish a network by putting towers in a region. Then, you can use the clustering technique to find those tower locations which will ensure that all the users receive optimum signal strength. """]})

In [5]:
# Display the raw toy corpus (one row, 'Text' column) before any cleaning
toy_corpus

Unnamed: 0,Text
0,"So, Data Science is primarily used to make de..."


In [6]:
# Run every document through text_preprocess and keep the result next to the
# raw text in a new 'Text Cleaned' column, then show the frame.
toy_corpus['Text Cleaned'] = (
    toy_corpus['Text']
    .astype(str)
    .apply(text_preprocess)
)
toy_corpus

Unnamed: 0,Text,Text Cleaned
0,"So, Data Science is primarily used to make de...",data science primarily used make decisions pre...


In [7]:
# Most frequent bigrams in the cleaned toy corpus, highest count first
top_ngrams(
    scoring="Frequency",
    corpus=toy_corpus['Text Cleaned'].tolist(),
    ngram=(2, 2),
    descending=True,
)

Unnamed: 0,ngram,freq/imp
0,machine learning,4
1,causal analytics,3
2,predictive causal,3
3,prescriptive analytics,3
4,analytics want,2
...,...,...
190,users receive,1
191,using historical,1
192,vehicles used,1
193,within dataset,1
