# Problem 1
a. Abnormal Scores Analysis

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def load_data(file_path):
    """Read a header-less, tab-separated file into a two-column DataFrame.

    The first field of every row is labelled ``score`` and the second
    ``phrase``.
    """
    column_names = ["score", "phrase"]
    frame = pd.read_csv(file_path, sep="\t", header=None, names=column_names)
    return frame

def analyze_scores(data, high_threshold=0.9, low_threshold=0.6, display_count=5):
    """Pick sample rows from both ends of the score range for manual review.

    Returns a pair of DataFrames: the first ``display_count`` rows whose
    ``score`` is strictly above ``high_threshold`` (candidates for
    high-scoring non-phrases), and the first ``display_count`` rows whose
    ``score`` is strictly below ``low_threshold`` (candidates for low-scoring
    good phrases). Rows keep the order they have in ``data``.
    """
    scores = data['score']
    above_cutoff = scores > high_threshold
    below_cutoff = scores < low_threshold

    top_rows = data[above_cutoff].head(display_count)
    bottom_rows = data[below_cutoff].head(display_count)
    return top_rows, bottom_rows

def plot_score_distribution(data, title):
    """Display a 50-bin histogram of the ``score`` column.

    ``title`` is interpolated into the figure title to identify the data set.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data['score'], bins=50, alpha=0.7)
    ax.set_title(f'Score Distribution in {title}')
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

def plot_length_vs_score(data, title):
    """Scatter-plot phrase word count against score.

    Side effect: adds (or overwrites) a ``length`` column on ``data`` holding
    the number of whitespace-separated words in each ``phrase``.
    """
    def _word_count(value):
        # Missing phrases count as zero words; any other value (including
        # non-strings such as floats) is stringified before splitting.
        if pd.isnull(value):
            return 0
        return len(str(value).split())

    data['length'] = data['phrase'].apply(_word_count)

    _, ax = plt.subplots(figsize=(10, 4))
    sns.scatterplot(data=data, x='score', y='length', alpha=0.5, ax=ax)
    ax.set_title(f'Score vs Phrase Length in {title}')
    ax.set_xlabel('Score')
    ax.set_ylabel('Phrase Length')
    ax.grid(True)
    plt.show()

def detect_outliers(data, z_thresh=3):
    """Return the rows whose score lies more than ``z_thresh`` standard
    deviations from the mean score.

    Parameters
    ----------
    data : DataFrame with a numeric ``score`` column.
    z_thresh : cut-off applied to the absolute z-score (default 3).

    Returns
    -------
    DataFrame containing the outlier rows, including their ``z_score``.

    Side effect: adds (or overwrites) a ``z_score`` column on ``data``.
    """
    scores = data['score'].to_numpy(dtype=float)
    # Ignore missing scores when computing mean and standard deviation. With
    # the default nan_policy='propagate', a single NaN turns every z-score
    # into NaN and no outlier is ever reported.
    data['z_score'] = stats.zscore(scores, nan_policy='omit')
    # Rows with a missing score keep a NaN z-score and never pass the filter.
    outliers = data[data['z_score'].abs() > z_thresh]
    return outliers

def analyze_correlation(data, title):
    """Print the correlation between score and phrase word count.

    Uses the ``length`` column when ``data`` already has one. Otherwise the
    word counts are computed on the fly from ``phrase`` (missing phrases
    count as zero words) so the function no longer fails with ``KeyError``
    when it is called before ``plot_length_vs_score``. ``data`` itself is
    not modified.

    ``title`` is only used to label the printed line.
    """
    if 'length' in data:
        lengths = data['length']
    else:
        # Same word-count rule that plot_length_vs_score applies.
        lengths = data['phrase'].apply(
            lambda x: len(str(x).split()) if pd.notnull(x) else 0
        )
    correlation = data['score'].corr(lengths)
    print(f"Correlation between score and phrase length in {title}: {correlation:.2f}")

# Locations of the three AutoPhrase result files - replace with your file paths
auto_phrase_path = 'AutoPhrase.txt'
single_word_path = 'AutoPhrase_single-word.txt'
multi_word_path = 'AutoPhrase_multi-words.txt'

# Read each file into its own DataFrame
auto_phrase_data = load_data(auto_phrase_path)
single_word_data = load_data(single_word_path)
multi_word_data = load_data(multi_word_path)

# Threshold-based listing, one file at a time
threshold_reports = [
    ("Analyzing AutoPhrase.txt", auto_phrase_data),
    ("\nAnalyzing AutoPhrase_single-word.txt", single_word_data),
    ("\nAnalyzing AutoPhrase_multi-words.txt", multi_word_data),
]
for heading, frame in threshold_reports:
    print(heading)
    high, low = analyze_scores(frame)
    print("High score potential non-phrases:\n", high)
    print("\nLow score potential good phrases:\n", low)

# Score histogram for each file
histogram_inputs = [
    (auto_phrase_data, "AutoPhrase.txt"),
    (single_word_data, "AutoPhrase_single-word.txt"),
    (multi_word_data, "AutoPhrase_multi-words.txt"),
]
for frame, file_label in histogram_inputs:
    plot_score_distribution(frame, file_label)

# Per file: length scatter plot, z-score outlier count, score/length correlation.
# plot_length_vs_score runs first because it adds the 'length' column that
# analyze_correlation reads.
per_file_inputs = [
    (auto_phrase_data, 'AutoPhrase'),
    (single_word_data, 'AutoPhrase Single Word'),
    (multi_word_data, 'AutoPhrase Multi Words'),
]
for data, title in per_file_inputs:
    print(f"\nAnalysis for {title}:")
    plot_length_vs_score(data, title)
    outliers = detect_outliers(data)
    print(f"Outliers detected: {len(outliers)}")
    analyze_correlation(data, title)

FileNotFoundError: [Errno 2] No such file or directory: 'AutoPhrase.txt'

### a. Explanation


In [None]:
def process_line(line):
    """Convert a line of whitespace-separated, underscore-joined phrases into
    a lower-cased, comma-separated string.
    """
    cleaned = []
    for token in line.split():
        # Underscores stand for the spaces inside a multi-word phrase
        cleaned.append(token.replace('_', ' ').lower())
    # Phrases are separated by a comma followed by a space
    return ', '.join(cleaned)

# Path to the file
file_path = 'segmentation.txt'

# NOTE: the file is rewritten in place, so this cell is not idempotent.
# Re-running it on an already processed file makes process_line split the
# space-separated phrases into individual words. Keep a copy of the original
# segmentation output if the cell may be run more than once.
with open(file_path, 'r') as file:
    processed_lines = [process_line(line.strip()) for line in file]

# Writing back to the same file
with open(file_path, 'w') as file:
    for line in processed_lines:
        file.write(line + '\n')

# Print up to the first 20 lines from the newly processed file. zip() stops at
# end of file, so a file shorter than 20 lines no longer raises StopIteration.
with open(file_path, 'r') as file:
    for _, line in zip(range(20), file):
        print(line.strip())


oql, c++, extending, c++
transaction management, multidatabase systems
overview
multimedia, information
active, database systems
object-oriented, dbmss, early
distributed, databases
an object-oriented, dbms, war, story, developing, genome, mapping, database, c++
cooperative, multiuser
architecture, multidatabase
physical object, management
introduction, next-generation, database, technology
object-oriented, database systems, reality
introduction, technology, interoperating, legacy databases
resolving, schematic, multidatabase systems
performance benchmark, object-oriented, database systems
object-oriented, databases
solution, managing, e, p, data
c++, object database
authorization, object-oriented, databases


# c)

In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Tokenise every line of the processed segmentation file.
# NOTE(review): simple_preprocess yields individual word tokens, so multi-word
# phrases are not kept as single vocabulary entries - confirm this is intended.
documents = []
with open('segmentation.txt', 'r') as file:
    for line in file:
        documents.append(simple_preprocess(line))

# Train 100-dimensional embeddings; min_count=1 keeps every token in the vocabulary
model = Word2Vec(
    sentences=documents,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Query phrases whose neighbours are reported below
phrases = [
    'computer science',
    'resource management',
    'natural language processing',
    'performance evaluation',
    'data structure',
    'artificial intelligence',
]

# Function to find and format the ten most similar phrases
def find_similar_phrases(phrase):
    """Return the ten vocabulary entries most similar to ``phrase``.

    The phrase is tokenised with ``simple_preprocess`` and represented by the
    mean of the vectors of its tokens. Tokens missing from the model's
    vocabulary are skipped instead of raising ``KeyError``. If no token is
    known (or the phrase tokenises to nothing), an empty list is returned
    rather than failing with a division by zero.

    Returns a list of ``(entry, "NN.NN%")`` tuples, where the percentage is
    the similarity reported by ``similar_by_vector`` scaled by 100 and rounded
    to two decimals.
    """
    # Tokenise once and keep only tokens the model has a vector for
    tokens = [word for word in simple_preprocess(phrase) if word in model.wv]
    if not tokens:
        return []
    phrase_vector = sum(model.wv[word] for word in tokens) / len(tokens)
    similar_phrases = model.wv.similar_by_vector(phrase_vector, topn=10)
    return [(entry, f"{round(score * 100, 2)}%") for entry, score in similar_phrases]

# Look up the neighbours of every query phrase, keyed by the phrase itself
formatted_results = {}
for phrase in phrases:
    formatted_results[phrase] = find_similar_phrases(phrase)

# Report one block per query phrase, one neighbour per line
for phrase, similars in formatted_results.items():
    print(f"\nPhrase: {phrase}\nSimilar Phrases:")
    for neighbour, percentage in similars:
        print(f"{neighbour} - Similarity Score: {percentage}")



Phrase: computer science
Similar Phrases:
computer - Similarity Score: 84.16%
science - Similarity Score: 73.13%
undergraduate - Similarity Score: 67.75%
mathematics - Similarity Score: 65.05%
school - Similarity Score: 64.84%
education - Similarity Score: 64.12%
curriculum - Similarity Score: 63.74%
college - Similarity Score: 63.62%
graduate - Similarity Score: 63.46%
teaching - Similarity Score: 62.83%

Phrase: resource management
Similar Phrases:
resource - Similarity Score: 83.03%
management - Similarity Score: 73.91%
accounting - Similarity Score: 61.64%
sla - Similarity Score: 61.44%
managing - Similarity Score: 61.02%
capacity - Similarity Score: 60.21%
enterprise - Similarity Score: 59.76%
manager - Similarity Score: 59.65%
resources - Similarity Score: 59.45%
policies - Similarity Score: 59.19%

Phrase: natural language processing
Similar Phrases:
language - Similarity Score: 73.42%
natural - Similarity Score: 66.51%
processing - Similarity Score: 64.94%
linguistic - Similar

**Interpretation:**

These results show that the model successfully identified related terms and captured the semantic relationships between them.

# Problem 2

In [2]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models

# Fetch Fashion MNIST as (images, labels) pairs for the train and test splits
(fashion_train_images, fashion_train_labels), (fashion_test_images, fashion_test_labels) = datasets.fashion_mnist.load_data()

# Divide pixel values by 255 (assumes 8-bit intensities, giving values in [0, 1]),
# then append a trailing channel axis so each image is 28x28x1
fashion_train_images = (fashion_train_images / 255.0).reshape((-1, 28, 28, 1))
fashion_test_images = (fashion_test_images / 255.0).reshape((-1, 28, 28, 1))


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


# a) Constructing the Model

In [3]:
# Base CNN: one convolution + pooling stage followed by a dense classifier
model = models.Sequential()
# 32 filters of size 3x3, stride 1, on 28x28 single-channel inputs
model.add(layers.Conv2D(32, (3, 3), strides=(1, 1), activation='relu', input_shape=(28, 28, 1)))
# 2x2 max pooling
model.add(layers.MaxPooling2D((2, 2)))
# Flatten the feature maps so they can feed the fully connected layer
model.add(layers.Flatten())
# Hidden layer with 100 ReLU units
model.add(layers.Dense(100, activation='relu'))
# Softmax output over the 10 classes
model.add(layers.Dense(10, activation='softmax'))


In [None]:
# Plain SGD at the baseline learning rate of 0.01
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs with 20% of the training data held out for validation
history = model.fit(
    fashion_train_images,
    fashion_train_labels,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

# Measure loss and accuracy on the test split
test_loss, test_acc = model.evaluate(fashion_test_images, fashion_test_labels)
print(f'Test Accuracy: {test_acc}, Test Loss: {test_loss}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8622000217437744, Test Loss: 0.387861430644989


**Test Accuracy:** 0.8622


**Test Loss:** 0.3879

**Observations:**
The model achieves a commendable test accuracy of about 86%, and the loss declined steadily during training. This indicates that the model was learning effectively with a learning rate of 0.01.

# b) Playing with the Learning Rates

In [4]:
# Start from freshly initialised weights. Recompiling alone keeps the weights
# learned in the previous experiment, so on a top-to-bottom run this cell
# would continue training an already-trained network instead of testing the
# learning rate. clone_model rebuilds the same architecture with new weights.
model = tf.keras.models.clone_model(model)

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=1e-5),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train for 10 epochs with 20% of the training data held out for validation
history = model.fit(fashion_train_images, fashion_train_labels, epochs=10, batch_size=64, validation_split=0.2)

# Measure loss and accuracy on the test split
test_loss, test_acc = model.evaluate(fashion_test_images, fashion_test_labels)
print(f'Test Accuracy: {test_acc}, Test Loss: {test_loss}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.23170000314712524, Test Loss: 2.2713232040405273


**Test Accuracy:** 0.2317

**Test Loss:** 2.2713

**Observation:** The model achieves a poor accuracy of 23% with the extremely low learning rate, showing that it struggled to learn. This suggests that the learning rate was too low for the model to make meaningful changes to its weights within 10 epochs.

In [5]:
# Start from freshly initialised weights. Recompiling alone keeps the weights
# left by the previous experiment, so the result would depend on which cells
# ran before this one. clone_model rebuilds the same architecture with new
# weights.
model = tf.keras.models.clone_model(model)

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=1),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train for 10 epochs with 20% of the training data held out for validation
history = model.fit(fashion_train_images, fashion_train_labels, epochs=10, batch_size=64, validation_split=0.2)

# Measure loss and accuracy on the test split
test_loss, test_acc = model.evaluate(fashion_test_images, fashion_test_labels)
print(f'Test Accuracy: {test_acc}, Test Loss: {test_loss}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.10000000149011612, Test Loss: 2.3036770820617676


**Test Accuracy:** 0.1000

**Test Loss:** 2.3037

**Observation:** The model achieves only 10% accuracy with this very high learning rate, which is no better than random guessing among the 10 classes. This shows that the very high learning rate caused the model to overshoot and fail to converge.

# c) Adding a convolution layer with 64 filters

In [9]:
# CNN with a second convolution + pooling stage (64 filters) ahead of the classifier
model = models.Sequential([
    # First stage: 32 filters of size 3x3 on 28x28 single-channel inputs
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    # Second stage: 64 filters of size 3x3
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Classifier: flatten, 100 ReLU units, softmax over 10 classes
    layers.Flatten(),
    layers.Dense(100, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

In [10]:
# Plain SGD at the baseline learning rate of 0.01
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs with 20% of the training data held out for validation
history = model.fit(
    fashion_train_images,
    fashion_train_labels,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

# Measure loss and accuracy on the test split
test_loss, test_acc = model.evaluate(fashion_test_images, fashion_test_labels)
print(f'Test Accuracy: {test_acc}, Test Loss: {test_loss}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8550999760627747, Test Loss: 0.41508257389068604


**Test Accuracy:** 0.8551

**Test Loss:** 0.4151

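**Observation:** In this run, adding the second convolutional layer did not improve on the base model: test accuracy is 0.8551 versus 0.8622, and test loss is 0.4151 versus 0.3879. The two results are close, so the difference may be within run-to-run variation. An additional convolutional layer can help in image classification tasks, but that benefit is not visible under this training setup (plain SGD, learning rate 0.01, 10 epochs).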

# d) Adding Momentum

In [7]:
# Start from freshly initialised weights. Recompiling alone keeps the weights
# learned in the previous experiment, so any gain would mix the effect of
# momentum with the effect of extra training epochs. clone_model rebuilds the
# current architecture with new weights.
# NOTE(review): this uses whichever architecture `model` currently holds -
# confirm that is the one this experiment is meant to compare against.
model = tf.keras.models.clone_model(model)

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train for 10 epochs with 20% of the training data held out for validation
history = model.fit(fashion_train_images, fashion_train_labels, epochs=10, batch_size=64, validation_split=0.2)

# Measure loss and accuracy on the test split
test_loss, test_acc = model.evaluate(fashion_test_images, fashion_test_labels)
print(f'Test Accuracy: {test_acc}, Test Loss: {test_loss}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8978000283241272, Test Loss: 0.2835187017917633


**Test Accuracy:** 0.8978

**Test Loss:** 0.2835

**Observation:** The addition of momentum has further enhanced the performance, resulting in the highest accuracy of all the tests, alongside decreased loss. Furthermore, the model converges faster and generalizes better, which can be seen in the higher accuracy achieved in fewer epochs, and in the higher test accuracy and lower test loss.

To summarize, the addition of momentum leads to the model learning better and uncovering more underlying patterns resulting in the best performance out of all the tests conducted.