# Machine learning and text data

## Importing resources

In [None]:
import nltk # the module
# Fetch every NLTK resource this notebook relies on.
# Resources that are already installed and up to date are not fetched again.
for resource in (
    # a selection of sample texts
    'gutenberg',
    'genesis',
    'inaugural',
    'nps_chat',
    'webtext',
    'treebank',
    'udhr',
    # utilities
    'wordnet',
    'punkt',
    'tagsets',
    'averaged_perceptron_tagger',
    # NLTK 3.9+ looks these names up instead of 'punkt', 'tagsets' and
    # 'averaged_perceptron_tagger'; without them word_tokenize, pos_tag and
    # nltk.help.upenn_tagset raise LookupError on current releases.
    'punkt_tab',
    'tagsets_json',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

from nltk.book import *

## Exploring the data

In [None]:
# look at a text's title
# (a bare name on the last line of a cell displays that object;
# text1 presumably comes from the `from nltk.book import *` above)
text1

In [None]:
# view the first sentence of the text
# the sentN names pair up with the textN names:
# sent1 is the first sentence of text1, sent2 of text2, and so on
sent1

In [None]:
# see how long the text is (total number of tokens, repeats included)
# text1 is the text explored in the neighbouring cells
len(text1)

In [None]:
# or how long a sentence is
# (len counts the items in sent1 — presumably tokens, so punctuation counts too)
len(sent1)

In [None]:
# how many unique words in a text?
# set() keeps one copy of each distinct token; the comparison is exact,
# so 'The' and 'the' count as two different entries
len(set(text1))

In [None]:
# what are the first 10 unique words?
# (a set has no defined order, so which 10 come out "first" is arbitrary)
list(set(text1))[:10]

In [None]:
# what are the first 10 unique words in alphabetical order?
# sorted() already returns a list, so it can be sliced directly
sorted(set(text1))[:10]

In [None]:
# frequency of words in a text
# FreqDist is built from the tokens of text1; indexing it by a token gives that token's count
dist = FreqDist(text1)
# len(dist) is the number of distinct tokens, not the total token count
print('There are ', len(dist), 'words in the distribution.')
# keep the keys: they are reused below to filter the vocabulary
vocab1 = dist.keys()
print('The first 10 words in the distribution are:\n', list(vocab1)[:10])
# lookups are exact-match, so this counts 'whale' but not 'Whale'
print('The word "whale" appears ', dist['whale'], 'times.')

In [None]:
# how many times does the token 'he' occur? (lower-case form only)
dist['he']

In [None]:
# filtering for certain types of words
# e.g. more than 5 letters, appears more than 100 times
freqwords = [
    word
    for word in vocab1
    if dist[word] > 100 and len(word) > 5
]
freqwords

## Working with words

### Normalization, stemming and lemmatization

In [None]:
# normalize: break the sample into words on single spaces, then lower-case each one
input1 = "List listed lists listing listings"
words1 = [word.lower() for word in input1.split(' ')]
words1

In [None]:
# get the stem for each word
# (a stemmer strips suffixes by rule, so a stem is not always a dictionary word)
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

In [None]:
# lemmatize each word
# lemmatize() is called without a part-of-speech argument, so NLTK's default applies
# (documented as noun) — compare the output with the stems above
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in words1]

In [None]:
# try with a different set of words

In [None]:
# load the words of the 'English-Latin1' file from the udhr corpus
# (the Universal Declaration of Human Rights) and look at the first 20
udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]

In [None]:
# stem and lemmatize the same 20 words so the two results can be compared
first_words = udhr[:20]
port = [porter.stem(token) for token in first_words]
WNlemma = nltk.WordNetLemmatizer()
lemma = [WNlemma.lemmatize(token) for token in first_words]
print(port, '\n', lemma)

### Pre-processing

In [None]:
# simple string split
# splitting on spaces leaves punctuation attached ("bed.") and keeps "shouldn't" whole
text11 = "Children shouldn't drink a sugary drink before bed." # a sample sentence
text11.split(' ')

In [None]:
# using nltk's built-in word_tokenize function
# compare the result with the plain split above, especially "shouldn't" and the final full stop
nltk.word_tokenize(text11)

In [None]:
# extracting sentences using another inbuilt function
# the sample contains full stops that do not end a sentence ("U.S.", "$2.99"),
# so splitting on '.' alone would give the wrong answer
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)
print('There are', len(sentences), 'sentences.')
print('They are:\n', sentences)

### Recognising and tagging parts of speech

In [None]:
# hint: per the NLTK docs the argument is a tag name or a regular expression over
# tag names; the matching entries of the Penn Treebank tag documentation are printed
nltk.help.upenn_tagset('N') # what does this do?

In [None]:
# tokenize first, then tag: pos_tag is given the list of tokens produced by word_tokenize
text14 = nltk.word_tokenize("Lucy loves Power BI except when she doesn't")
nltk.pos_tag(text14)

### POS tagging and parsing ambiguity

In [None]:
# a garden-path sentence: read correctly, "man" is the verb ("the old [people] man the boat")
# check which tags the tagger actually assigns to "old" and "man"
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

In [None]:
# a grammatical but meaningless sentence: tagging looks at form, not at whether the sentence makes sense
text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")
nltk.pos_tag(text19)

## Sentiment analysis

### Fetch the data

In [None]:
import pandas as pd
import numpy as np

# Read in the data and keep a 10% sample to speed up computation (optional);
# the fixed random_state makes the sample the same on every run
reviews_url = 'https://raw.githubusercontent.com/jargonautical/bsuBootcamp/refs/heads/main/Amazon_Unlocked_Mobile.csv'
df = pd.read_csv(reviews_url).sample(frac=0.1, random_state=10)

# Show one randomly chosen row (different on each run)
df.sample()

In [None]:
# Drop missing values
df = df.dropna()

# Remove any 'neutral' ratings equal to 3 (why do we do this?)
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning
df = df[df['Rating'] != 3].copy()

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.sample()

In [None]:
# Explore the data
# Are there more positive or more negative ratings in our sample?
# The labels are 0/1, so the mean is the share of positive reviews:
# above 0.5 means more positive than negative, and it also shows by how much
# (the median would only ever report 0, 0.5 or 1).
df['Positively Rated'].mean()

### Train / Test split

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
# (no test_size is given, so scikit-learn's default split is used;
# random_state=0 makes the split repeatable)
review_text = df['Reviews']
review_label = df['Positively Rated']
X_train, X_test, y_train, y_test = train_test_split(review_text, review_label, random_state=0)

In [None]:
# quick check to see what it looks like:
# the text of the first training review, then the shape of the training set
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

### Reshape the data as the model requires it - VECTORIZATION

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the CountVectorizer to the training data only,
# so the vocabulary is learned without looking at the test reviews
vect = CountVectorizer()
vect.fit(X_train)

In [None]:
# look at every 2000th feature name to get a feel for the learned vocabulary
vect.get_feature_names_out()[::2000]

In [None]:
# how many features (distinct vocabulary entries) did the vectorizer learn?
len(vect.get_feature_names_out())

### Transform the training data and train the model

In [None]:
# transform the documents in the training data to a document-term matrix
# (one row per review, one column per vocabulary entry; displayed below for inspection)
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

In [None]:
from sklearn.linear_model import LogisticRegression

# Train the model (why have we chosen this one?)
# max_iter=1000 raises the solver's iteration limit above scikit-learn's default
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

### Evaluate the model

In [None]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
X_test_vectorized = vect.transform(X_test)
predictions = model.predict(X_test_vectorized)

# AUC measures how well the model *ranks* reviews, so it needs a score per
# review rather than the hard 0/1 labels. Column 1 of predict_proba is the
# probability of the positive class (label 1).
positive_scores = model.predict_proba(X_test_vectorized)[:, 1]

print('AUC: ', roc_auc_score(y_test, positive_scores))

In [None]:
# get the feature names as numpy array so they can be indexed by position
feature_names = np.array(vect.get_feature_names_out())

# Positions of the model coefficients, ordered from smallest to largest value
sorted_coef_index = model.coef_[0].argsort()

# The first 10 positions are the smallest coefficients; the slice [:-11:-1]
# walks backwards from the end, giving the 10 largest, largest first
most_negative = feature_names[sorted_coef_index[:10]]
most_positive = feature_names[sorted_coef_index[:-11:-1]]
print('Smallest Coefs:\n{}\n'.format(most_negative))
print('Largest Coefs: \n{}'.format(most_positive))

In [None]:
# BUT ... we have a problem!
# These 2 reviews are treated the same by our current model:
# they contain exactly the same words, only in a different order,
# and single-word counts ignore word order
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

### Using n-grams

In [None]:
# Fit the CountVectorizer to the training data specifying a minimum
# document frequency of 5 and extracting 1-grams and 2-grams.
# ngram_range=(1, 2) keeps single words and adds word pairs such as
# 'not working', which is what lets the model use word order.
vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
predictions = model.predict(X_test_vectorized)
# AUC needs a score per review, not hard 0/1 labels;
# column 1 of predict_proba is the probability of the positive class
print('AUC: ', roc_auc_score(y_test, model.predict_proba(X_test_vectorized)[:, 1]))

# Features with the most negative / most positive coefficients
feature_names = np.array(vect.get_feature_names_out())
sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
# These reviews should now be told apart
# (expected output: 1 for the first review, 0 for the second)
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

def plot_roc_curve(true_y, y_prob):
    """
    Plot the ROC curve for a set of predicted scores.

    true_y: the true labels.
    y_prob: the predicted score (e.g. probability of the positive class)
            for each label in true_y.

    The curve is drawn with matplotlib's pyplot interface; nothing is returned.
    """
    # roc_curve also returns the thresholds, which are not needed for the plot
    false_positive_rate, true_positive_rate, _ = roc_curve(true_y, y_prob)

    plt.plot(false_positive_rate, true_positive_rate)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [None]:
# A ROC curve is traced by sweeping a threshold over per-review scores, so it
# needs probabilities; hard 0/1 predictions give a curve with a single corner.
# Column 1 of predict_proba is the probability of the positive class.
test_scores = model.predict_proba(vect.transform(X_test))[:, 1]
plot_roc_curve(y_test, test_scores)
print(f'model 1 AUC score: {roc_auc_score(y_test, test_scores)}')

## Topic modelling

### Course and teaching reviews

In [None]:
import pandas as pd
# Load the course reviews and keep a repeatable 10% sample.
# Note: this replaces the `df` used in the sentiment-analysis section.
course_reviews_url = 'https://raw.githubusercontent.com/jargonautical/bsuBootcamp/refs/heads/main/reviews.csv'
df = pd.read_csv(course_reviews_url).sample(frac=0.1, random_state=10)
df.head()

In [None]:
# VADER is a lexicon-based sentiment scorer, so it needs its word list downloaded first
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()


def _compound_score(review):
    """Return the 'compound' entry of VADER's polarity scores for one review."""
    return sia.polarity_scores(review)['compound']


# one overall sentiment score per review
df['polarity scores'] = df['Review'].apply(_compound_score)

In [None]:
# check the new 'polarity scores' column next to the review text
df.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """
    Draw a word cloud of the text in `data`.

    data:  a single string, or an iterable of texts (e.g. a pandas Series of
           reviews). An iterable is joined into one string so that every item
           contributes in full; calling str() on a Series would only give its
           shortened display form (a few rows, clipped strings, index labels).
    title: optional title shown above the figure.

    The figure is displayed with plt.show(); nothing is returned.
    """
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(item) for item in data)
        except TypeError:
            # not iterable: fall back to its string form
            text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1 # fixed seed so the layout is the same on every run
    ).generate(text)

    fig = plt.figure(1, figsize=(8, 8))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(df['Review'])

In [None]:
import scipy.sparse as ss # for making sparse matrices
from corextopic import corextopic as ct # a topic modelling methodology

In [None]:
# Build the binary document-word matrix that CorEx works on:
# one row per review, one column per word, 1 if the word occurs in the review
corex_docs = df['Review'].tolist()
vectorizer = CountVectorizer(stop_words='english', lowercase=True, binary=True)
doc_word = ss.csr_matrix(vectorizer.fit_transform(corex_docs))

# column labels for the matrix, in column order
words = list(np.asarray(vectorizer.get_feature_names_out()))

In [None]:
# extending stopwords
nltk.download('stopwords')
# Imported under an alias on purpose: the name `stopwords` is already used for
# the word-cloud stop-word set that show_wordcloud reads, and a plain
# `from nltk.corpus import stopwords` would overwrite it and break later
# word clouds.
from nltk.corpus import stopwords as nltk_stopwords
stop_words = list(nltk_stopwords.words('english'))
stop_words.extend(['andrew', 'ng', 'chuck', 'israel', 'really'])
# NOTE(review): this list only takes effect if it is passed to the vectorizer,
# e.g. CountVectorizer(stop_words=stop_words, ...) — the vectorizer cell uses
# the built-in 'english' list.

In [None]:
# num topics CorEx will identify
target_num_topics = 20

topic_model = ct.Corex(
    n_hidden=target_num_topics,
    words=words,
    max_iter=1000,
    verbose=False,
    seed=2020,
)
topic_model.fit(doc_word, words=words)
topics = topic_model.get_topics()

# Clear topic columns left over from an earlier run of this cell
stale_topic_cols = [name for name in df.columns if name.startswith('topic_')]
for stale in stale_topic_cols:
    del df[stale]

# One column per topic, taken from the matching column of log_p_y_given_x
for topic_idx, _ in enumerate(topics):
    df['topic_' + str(topic_idx)] = topic_model.log_p_y_given_x[:, topic_idx]

# For each review, the name of the topic column holding the largest value
corex_cols = [name for name in df if name.startswith('topic_')]
df['best_topic'] = df[corex_cols].idxmax(axis=1)

In [None]:
# Print one line per topic: its id followed by its words
for n, topic in enumerate(topics):
    # The word is the first item of every topic entry. Picking it by position
    # (instead of unpacking zip(*topic) into three names) also works for a
    # topic that ended up with no words, which would otherwise raise ValueError.
    topic_words = [entry[0] for entry in topic]
    outText = 'topic_' + str(n) + ',' + ','.join(topic_words) + '\n'
    print(outText)

In [None]:
# look at one row to see the new topic_N and best_topic columns
df.head(1)

In [None]:
## Choosing our own topics

In [None]:
# Words we want topics to form around; they are passed to CorEx as anchors
anchor_words = ['confidence', 'practical', 'interesting', 'video','assignment']
# anchor_strength controls how strongly the model is pulled towards the anchors
topic_model.fit(doc_word, words=words, anchors=anchor_words, anchor_strength=6);
topics = topic_model.get_topics()
topic_list = []
# NOTE(review): the topic_N / best_topic columns of df still hold the values
# from the un-anchored fit; recompute them if the export should reflect
# these anchored topics.

for n, topic in enumerate(topics):
    # The word is the first item of every topic entry; picking it by position
    # also works for a topic with no words (unpacking zip(*topic) would raise).
    topic_words = [entry[0] for entry in topic]
    outText = 'topic_' + str(n) + ',' + ','.join(topic_words) + '\n'
    print(outText)
    #with open('topicsList.txt', 'a') as f:
    #    # write a row to the  file
    #    f.write(outText)

In [None]:
# Bar chart with one bar per topic, showing the values in topic_model.tcs
# (labelled as total correlation in nats on the y-axis)
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10,5))
plt.bar(range(topic_model.tcs.shape[0]), topic_model.tcs, color='#4e79a7', width=0.5)
plt.xlabel('Topic', fontsize=16)
plt.ylabel('Total Correlation (nats)', fontsize=16);

In [None]:
# extending stopwords
# template: replace the placeholder strings with the words you want to exclude
stop_words.extend(['words', 'I', 'want', 'to', 'exclude'])

### Visualising your analysis

In [None]:
# write the topic IDs and descriptions to text file
# The file is opened once for the whole loop rather than once per topic.
# NOTE: mode 'a' appends, so re-running this cell adds the same rows again.
with open('topicsList.txt', 'a') as f:
    for n, topic in enumerate(topics):
        # The word is the first item of every topic entry; picking it by
        # position also works for a topic with no words (unpacking
        # zip(*topic) into three names would raise ValueError).
        topic_words = [entry[0] for entry in topic]
        outText = 'topic_' + str(n) + ',' + ','.join(topic_words) + '\n'
        # write a row to the file
        f.write(outText)

In [None]:
# export the dataframe to CSV
# (if you want to visualise in another platform)
# the file is written to the current working directory
df.to_csv('data_out.csv')

# Practical  

### Using machine learning to analyse text in the Movies dataset
We haven't found any strong patterns or clusters in the numeric columns.
Perhaps we should be guiding our client towards the *style* of movie they ought to make?  
#### Q 1: Define the problem.
How do we phrase the question as one that a machine learning model can solve?  
#### Q 2: Feature selection
Which attributes in the data can we use?
#### Q 3: Model selection
Which model would be appropriate or relevant here?  
Is there more than one answer to this question?
#### Q 4: Data preparation
Once we've chosen a model, what does the data need to look like?  
#### NOT FORGETTING ...
If you had an answer, what would it look like?