**Part A**

In [None]:
# import libraries
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import ElasticNet

import statsmodels.api as sm
from scipy.stats import zscore

%matplotlib inline


In [None]:
# Read the Amazon advertising data. The 'Sr.No' column is dropped right after
# loading, so it never enters the correlation or regression steps below.
a = pd.read_csv("Amazon.csv").drop(['Sr.No'], axis=1)
a


In [None]:
# Task 1: which social media to focus for a maximized profit

# Quick visual check of the relationships: pairwise plots of every column in `a`.
sns.pairplot(a)


Observations: YouTube and Facebook are more strongly correlated with expected profit than Instagram is.

In [None]:
# Print the pairwise correlation matrix of all columns in `a` to back up the
# visual impression from the pair plot with numbers.
print(a.corr())


Observations:
Among the independent variables, the highest pairwise correlation (0.354104) is between Instagram and Facebook.
Instagram, the channel with the lowest contribution to expected profit, was excluded from the analysis to reduce the risk of multicollinearity.
    
We then built several multiple linear regression models with Youtube and Facebook as the independent variables.

In [None]:
# assign X: the two retained predictors as a plain NumPy array
amazon_data = a[["Youtube", "Facebook"]].values

# create constant for SM OLS package (sm.OLS does not add an intercept by itself)
amazon_data_w_constant = sm.add_constant(amazon_data)

# assign y; note the column name really ends with a space, and the double
# brackets select a one-column DataFrame, so `.values` is 2-D here
amazon_profit = a[["Expected profit "]].values

# name X features, in the column order of amazon_data_w_constant
# (used as `xname` when printing the statsmodels summary)
amazon_data_names = ["constant", "Youtube", "Facebook"]

### Model 1: Statsmodels OLS

In [None]:
# Split and fit the model.
# The design matrix already contains the constant column, so the fitted model
# has an intercept.
X, y = amazon_data_w_constant, amazon_profit

# A fixed random_state makes the 80/20 split (and therefore every score
# reported below) reproducible on Restart & Run All. 100 matches the seed used
# for the splits in Parts B and C.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.8, random_state=100
)

OLSMod = sm.OLS(train_y, train_X).fit()
print(OLSMod.summary(xname=amazon_data_names))


In [None]:
# Evaluate the statsmodels model on the held-out rows.

predictions = OLSMod.predict(test_X)

# R^2 on the test split, formatted as a percentage with two decimals.
R2 = f"{r2_score(test_y, predictions):.2%}"
print("Prediction R2:", R2)


In [None]:
# Residual analysis for the statsmodels fit:
# left panel  - histogram (with KDE) of the training residuals
# right panel - normal P-P plot of the same residuals against the 45-degree line

sns.set_style("whitegrid")

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 7))
histplot = sns.histplot(OLSMod.resid, kde=True, color='blue', ax=ax[0])
sm.ProbPlot(OLSMod.resid, fit=True).ppplot(line='45', ax=ax[1]);


In [None]:
# Z-score of the target, stored next to the data for the outlier check
a['z_score_price'] = zscore(a["Expected profit "])

# A row is flagged as an outlier when its z-score lies more than 3 standard
# deviations from the mean in either direction.
ao = a[a.z_score_price.abs() > 3.0]

print(ao)


Observations: No observation has an expected-profit z-score beyond ±3, so no outliers were found in the data.

### Model 2: Sklearn OLS

In [None]:
# Split and fit the model.
# scikit-learn adds the intercept itself, so the design matrix without the
# constant column is used here.
X, y = amazon_data, amazon_profit

# Fixed random_state so the split and the reported scores are reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.8, random_state=100
)

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises TypeError. For unpenalised least squares it only
# rescaled the features internally and did not change the fitted predictions,
# so it is simply dropped.
LinReg = LinearRegression(fit_intercept=True)
LinReg.fit(train_X, train_y)
print("Model Score ", LinReg.score(train_X, train_y), "\ncoefficients: ", LinReg.coef_, "\nintercept: ", LinReg.intercept_)


In [None]:
# Score the scikit-learn linear regression on the held-out rows.
predictions = LinReg.predict(test_X)

# Test-split R^2 as a percentage string with two decimals.
test_r2 = r2_score(test_y, predictions)
R2 = f"{test_r2:.2%}"

print("Prediction R2:", R2)


### Model 3: ElasticNet

In [None]:
# Split and fit the model.
X, y = amazon_data, amazon_profit

# Split exactly once. The original cell split twice and also built a second,
# discarded ElasticNet instance. A fixed random_state keeps the split and the
# scores reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.8, random_state=100
)

# ElasticNet with its default penalty settings; random_state pins the estimator.
regr = ElasticNet(random_state=0)
regr.fit(train_X, train_y)

print("Model Score ", regr.score(train_X, train_y), "\ncoefficients: ", regr.coef_, "\nintercept: ", regr.intercept_)


In [None]:
# Score the ElasticNet model on the held-out rows.
predictions = regr.predict(test_X)

# Test-split R^2 rendered as a percentage with two decimals.
R2 = "{score:.2%}".format(score=r2_score(test_y, predictions))

print("Prediction R2:", R2)


Conclusion: Instagram is correlated with Facebook, which can introduce multicollinearity, and it also has a negligible correlation with profit compared with the other two variables. After removing Instagram, we obtained more reliable prediction models. All three models achieved high prediction R² scores (up to 92.84%), although the exact values can vary with the random train/test split. These models can therefore inform marketing strategies that focus on YouTube and Facebook.

**Part B**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import statsmodels.api as sm
import seaborn as sns

In [None]:
# Read the REMAX housing data and preview the first rows.
data_b0=pd.read_csv('REMAX.csv')
data_b0.head()

In [None]:
# Explore the data: column names, non-null counts and dtypes.
data_b0.info()

In [None]:
# Summary statistics of the numeric columns.
data_b0.describe()

In [None]:
# Convert 'yes' / 'no' values to 1 and 0.
binary_cols = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]

for col in binary_cols:
    # Series.map turns every value missing from the mapping into NaN, so
    # running this cell a second time would wipe the already-converted 0/1
    # values. Skipping columns that are already numeric keeps the cell safe to
    # re-run.
    if not pd.api.types.is_numeric_dtype(data_b0[col]):
        data_b0[col] = data_b0[col].map({'yes': 1, 'no': 0})

In [None]:
# One-hot encode 'furnishingstatus' into one indicator column per level.
# NOTE(review): every level is kept (no drop_first), so the indicator columns
# sum to 1 on each row and are collinear with an intercept column - confirm
# this is intended given the VIF-based removal further down.
data_b = pd.get_dummies(data_b0, columns=['furnishingstatus'])
data_b.head()

In [None]:
# Separate the target from the predictors.
## Dependent variable: the sale price
y_b = data_b['price']
## Independent variables: every remaining column
x_b = data_b.drop(columns='price')

In [None]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split

# `x_b_constant` was never defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. statsmodels' OLS does not add an
# intercept by itself, so the design matrix is built here by adding a constant
# column to the predictors.
x_b_constant = sm.add_constant(x_b)

train_x_b, test_x_b, train_y_b, test_y_b = train_test_split(
    x_b_constant, y_b, train_size=0.8, random_state=100
)

In [None]:
# Model fit: ordinary least squares on the training split.
# The regressors are cast to float before fitting (presumably because the
# dummy-encoded columns are not float - TODO confirm).
remax_lm=sm.OLS(train_y_b,train_x_b.astype(float)).fit()

In [None]:
# Check the estimated parameter results (one coefficient per regressor column)
print(remax_lm.params)

In [None]:
# Explore the statistical summary of the first regression
print(remax_lm.summary())

In [None]:
# Conduct a residual analysis
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

remax_resid = remax_lm.resid
probplot = sm.ProbPlot( remax_resid, fit = True )
plt.figure( figsize = (8, 6) )
probplot.ppplot( line='45' )
plt.title( "[Fig B-1] - Normal P-P Plot of Regression Standardized Residuals for First Regression" )
plt.show()

In [None]:
# Draw the correlation matrix of all columns in data_b as a heatmap, with the
# column names as tick labels on both axes. The result is assigned to `_` so
# the returned Axes object is not echoed below the plot.
corr = data_b.corr()
_ = sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values)

In [None]:
# Detect multicollinearity with VIF and model with best fit
from statsmodels.stats.outliers_influence import variance_inflation_factor
def get_vif_factors( x_b ):
    """Compute the variance inflation factor (VIF) of every column.

    Parameters
    ----------
    x_b : pandas.DataFrame
        Regressor columns. They are used exactly as passed; no constant column
        is added here.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with the column name in 'column' and its
        VIF in 'vif', in the same order as ``x_b.columns``.
    """
    # Cast to float explicitly: a frame that mixes integer and boolean columns
    # (as produced by pd.get_dummies in recent pandas) otherwise converts to an
    # object array, which the least-squares fits behind the VIF cannot handle.
    x_b_matrix = x_b.to_numpy(dtype=float)
    vif = [
        variance_inflation_factor(x_b_matrix, i)
        for i in range(x_b_matrix.shape[1])
    ]
    return pd.DataFrame({'column': x_b.columns, 'vif': vif})

In [None]:
# VIF of every predictor, largest first, to spot the most collinear columns.
vif_factors = get_vif_factors(x_b).sort_values('vif', ascending=False)
vif_factors

In [None]:
# Remove variables with high VIF
rmv = ['furnishingstatus_furnished','furnishingstatus_semi-furnished','furnishingstatus_unfurnished']

# Keep the remaining columns in their original order. The previous
# list(set(...) - set(...)) returned them in an arbitrary order that could
# change from one kernel session to the next, reshuffling the VIF table and
# the regression summary between runs.
x_b2 = [col for col in x_b.columns if col not in rmv]
get_vif_factors(x_b[x_b2])

In [None]:
# Build new model after removing variables with high VIFs
# NOTE(review): x_b2 is built from x_b's columns, so any constant column that
# was added to the training matrix upstream is not selected here and this
# second model would be fit without an intercept - confirm that is intended.
train_x_b = train_x_b[x_b2]
remax_lm2 = sm.OLS(train_y_b, train_x_b).fit()
print(remax_lm2.summary())

In [None]:
# Re-do the residual analysis and compare with the first one
remax_resid2 = remax_lm2.resid
probplot = sm.ProbPlot( remax_resid2, fit = True )

# Pass explicit axes to ppplot(): without them it opens its own figure, so the
# figure created just for the size stayed empty and the size was not applied.
fig, ax = plt.subplots( figsize = (8, 6) )
probplot.ppplot( line='45', ax=ax )
ax.set_title( "[Fig B-2] Normal P-P Plot of Regression Standardized Residuals for Second Regression" )
plt.show()

**Part C**

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import statsmodels.api as sm

# Read the Excel dataset, write it back out as a CSV file (no index column,
# header row kept), then load that CSV as the working DataFrame.

read = pd.read_excel ("Inventory.xlsx")
read.to_csv ("Inventory.csv",index = None,header=True)
data_c=pd.read_csv('Inventory.csv')
data_c.head()

In [None]:
# Remove every space from the column names so the columns can be reached by
# attribute access (e.g. data_c.Cost) and to prevent lookup errors.
data_c.columns = data_c.columns.str.replace(' ', '')
data_c.head()

In [None]:
# Explore dataset: column names, non-null counts and dtypes.
data_c.info()

In [None]:
# Draw a scatter plot of Cost (y axis) against Quantity (x axis) to check
# visually whether the relationship between the two variables looks linear.
plt.scatter(data_c['Quantity'],data_c['Cost'])
plt.xlabel("Quantity")
plt.ylabel("Cost")
plt.show()

In [None]:
# Assign dependent and independent variables
## Independent variable: the single 'Quantity' column (a Series)
x_c = data_c['Quantity']

## Dependent variable: 'Cost'
y_c = data_c['Cost']

In [None]:
# Split the dataset into train (80%) and test (20%) sets.
# random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split
train_x_c, test_x_c, train_y_c, test_y_c = train_test_split( x_c,y_c,train_size = 0.8,random_state = 100 )

In [None]:
# Derive the regression model
# NOTE(review): train_x_c holds only 'Quantity' and sm.OLS does not add an
# intercept by itself, so this fits Cost = b * Quantity through the origin -
# confirm that is intended (sm.add_constant would add the intercept term).
rue_lm=sm.OLS(train_y_c,train_x_c).fit()

In [None]:
# Check the estimated parameter results (one coefficient per regressor)
print(rue_lm.params)

In [None]:
# Check the model diagnostics via the full statsmodels summary table
print(rue_lm.summary())

In [None]:
# Residual analysis: normal P-P plot of the model residuals against the
# 45-degree reference line.
rue_resid = rue_lm.resid
probplot = sm.ProbPlot( rue_resid, fit = True )

# Pass explicit axes to ppplot(): without them it opens its own figure, so the
# figure created just for the size stayed empty and the size was not applied.
fig, ax = plt.subplots( figsize = (8, 6) )
probplot.ppplot( line='45', ax=ax )
ax.set_title( "[Fig C] - Normal P-P Plot of Regression Standardized Residuals" )
plt.show()

In [None]:
# Check the z-score for outlier analysis: standardise the Cost column and
# store the result next to the data as 'z_score_cost'.
from scipy.stats import zscore
data_c['z_score_cost']=zscore(data_c.Cost)

In [None]:
# Rows whose Cost z-score lies more than 3 standard deviations from the mean.
data_c[data_c.z_score_cost.abs() > 3.0]

In [None]:
# Apply the fitted model to the held-out test quantities to get predicted costs.
pred_y_c = rue_lm.predict( test_x_c )

In [None]:
# Root mean squared error of the test-set predictions.
np.sqrt(mean_squared_error(test_y_c, pred_y_c))

In [None]:
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Predict the y values for the held-out test rows
pred_y_c = rue_lm.predict(test_x_c)

# wls_prediction_std() falls back to the model's own training rows when no
# `exog` is given, so the interval bounds previously belonged to the training
# data and did not line up with the test-set predictions in the table below.
# The function expects a 2-D (rows x regressors) array, so the test regressors
# are reshaped to one row per observation before being passed in.
test_exog_c = np.asarray(test_x_c, dtype=float).reshape(len(test_x_c), -1)

# alpha = 0.1 gives the bounds of a 90% prediction interval.
_, pred_y_low, pred_y_high = wls_prediction_std(rue_lm, exog=test_exog_c, alpha=0.1)

pred_y_df = pd.DataFrame({
    'Quantity': test_x_c,
    'pred_y': pred_y_c,
    'pred_y_left': pred_y_low,
    'pred_y_right': pred_y_high,
})

pred_y_df

**Part D**

In [None]:
# Task 1: count how often each letter a-z occurs across both scripts.

# Import regex library
import re
from collections import Counter

# Read both scripts. The `with` block closes each file handle as soon as its
# contents have been read (the handles were previously left open).
with open("script01.txt", "r") as file1, open("script02.txt", "r") as file2:
    script1 = file1.read()  # full text of script01.txt
    script2 = file2.read()  # full text of script02.txt

script = script1 + script2  # combine script1 and script2

# Step 2: the letters that are counted
atoz = "abcdefghijklmnopqrstuvwxyz"

# Step 3: count every character case-insensitively, keeping only a-z
outcome = Counter(
    ch for ch in (character.lower() for character in script) if ch in atoz
)

# Sort from highest to lowest count; ties keep first-seen order
outcome_sort = sorted(outcome.items(), key=lambda item: item[1], reverse=True)

# Write the output to text file called parta
with open('parta.txt', 'w') as f:
    print(outcome_sort, file=f)

# Display the sorted output. A bare expression is only rendered when it is the
# last statement of the cell, so it has to come after the file is written.
outcome_sort



In [None]:
# Task 2: count how often each word occurs across both scripts.

# Import regex library
import re
from collections import Counter

# Read both scripts; the `with` block closes the file handles afterwards
# (they were previously left open).
with open("script01.txt", "r") as file1, open("script02.txt", "r") as file2:
    script1 = file1.read()  # full text of script01.txt
    script2 = file2.read()  # full text of script02.txt

# Combine script1 and script2
script = script1 + script2

# Convert all text to lower case
script = script.lower()

# Replace every run of characters other than a-z and space (punctuation,
# digits, newlines, ...) with a single space
script = re.sub('[^a-z ]+', ' ', script)

# Split the string into a list of individual words
words = script.split()

# Count the occurrences of each word
outcome2 = Counter(words)

# Sort by number of occurrences, highest first; ties keep first-seen order
outcome2_sort = outcome2.most_common()

# Write the output to text file called partb
with open('partb.txt', 'w') as f:
    print(outcome2_sort, file=f)

# Display the top 10 most frequently occurring words. A bare expression is only
# rendered when it is the last statement of the cell, so it goes last.
outcome2_sort[:10]


In [None]:
# Task 3: take the 10 most frequent non-stop-words of script 1 and report how
# often each of them occurs in script 1 and in script 2.

# Get relevant libraries
import re
from collections import Counter

import pandas as pd


def tokenize(text):
    """Lowercase `text`, replace every run of characters other than a-z and
    space with a single space, and split the result into a list of words."""
    return re.sub('[^a-z ]+', ' ', text.lower()).split()


def keep_word(word):
    """Return True for words that are not stop words and have 2+ letters."""
    return len(word) >= 2 and word not in stop_words


# Read both scripts; the `with` block closes the file handles afterwards
# (they were previously left open).
with open("script01.txt", "r") as file1, open("script02.txt", "r") as file2:
    script1 = file1.read()
    script2 = file2.read()

# Load the stop words. The column is addressed as 'above', which is itself a
# stop word: the CSV apparently has no header row, so pandas consumed its first
# entry as the column name. That entry is added back so "above" is filtered
# like every other stop word. A set gives constant-time membership tests
# instead of scanning a list once per word.
stop_df = pd.read_csv('stopwords.csv')
stop_words = set(stop_df['above'].dropna().astype(str)) | {'above'}

# Split each script into a list of lowercase words
script1 = tokenize(script1)
script2 = tokenize(script2)

# Filtered words of each script (no stop words, no single letters)
final_words1 = [word for word in script1 if keep_word(word)]
final_words2 = [word for word in script2 if keep_word(word)]

# Word counts of script 1, sorted from highest to lowest count
outcome_script1 = Counter(final_words1).most_common()

# The 10 most frequent (word, count) pairs of script 1, and just their words
top_10 = outcome_script1[:10]
top_10_words = [word for word, _ in top_10]

# How often each of those words occurs in script 2 (0 when it never does)
script2_totals = Counter(final_words2)
script2_count = {word: script2_totals[word] for word in top_10_words}

# Sort both lists of (word, count) pairs alphabetically by word
count = sorted(script2_count.items(), key=lambda item: item[0])
top_10 = sorted(top_10, key=lambda item: item[0])

# Convert the lists to dataframes
df1 = pd.DataFrame(top_10, columns=['Word', 'Script1_count'])
df2 = pd.DataFrame(count, columns=['Word', 'Script2_count'])

# Merge the 2 dataframes on 'Word' to get each word's count in both scripts
data = df1.merge(df2)

# Write the output to text file called partc
with open('partc.txt', 'w') as f:
    print(data, file=f)

# Display the resulting dataframe. A bare expression is only rendered when it
# is the last statement of the cell, so it goes after the file is written.
data