In [None]:
# Importing necessary libraries for data analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pick the matplotlib style sheet once, then apply it to every later figure
plot_style = 'seaborn-v0_8'
plt.style.use(plot_style)

# Confirmation message so the cell produces visible output when it succeeds
print('Libraries imported successfully.')

The libraries have been successfully imported, and the environment is ready for data analysis and visualization.

In [None]:
# Read the CSV export (path is relative to the notebook's working directory)
csv_path = 'data-export.csv'
data = pd.read_csv(csv_path)

# Preview the top rows (head() defaults to five) to see the columns and sample values
preview = data.head()
print(preview)

This code snippet reads a CSV file named 'data-export.csv' into a pandas DataFrame and displays the first few rows to understand its structure.

In [None]:
# Getting basic information about the dataset
print('Dataset Info:')
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray
# "None" line after the report.
data.info()
# The leading "\n" puts a blank line between the info report and the
# statistics table.
print('\nBasic Statistics:')
# describe() summarises the numeric columns (count, mean, std, quartiles, ...)
print(data.describe())

The next cell calculates and prints the number of missing values in each column of the dataset.

In [None]:
# Count the null entries column by column (isna is the same check as isnull)
missing_values = data.isna().sum()

# Header and per-column counts, each on its own line
print('Missing Values in Each Column:', missing_values, sep='\n')

This code snippet extracts discount percentages and region names from the 'bestPriceRegion' column in the DataFrame.

In [None]:
# Source column that carries both the region label and the discount text
# NOTE(review): presumably formatted like "<region> / <number>%" — confirm against the CSV
best_price_text = data['bestPriceRegion']

# Discount: the (optionally negative, optionally decimal) number directly before a '%',
# converted to float; rows without a match become NaN
data['discount'] = best_price_text.str.extract(r'(-?\d+\.?\d*)%', expand=False).astype(float)

# Region: everything before the first '/', excluding whitespace that precedes the slash
data['region'] = best_price_text.str.extract(r'(.*?)\s*/', expand=False)

# Show the newly derived columns next to the VM name
preview_columns = ['name', 'region', 'discount']
print('Sample of processed data with extracted discounts and regions:')
print(data[preview_columns].head())

This completes the initial data loading and preprocessing steps. Here's what we've found:

The data is loaded and contains information about VM configurations, including:

- Name of the VM
- Number of cores
- Memory in MB
- Linux and Windows pricing
- Best price region with discount

Data quality checks show:

- 919 total entries
- 48 missing values in the `windowsPrice` column
- All other columns are complete

I've extracted:

- Discount percentages from the `bestPriceRegion` column
- Clean region names

## Data visualization, focusing on price distributions and regional analysis

In [None]:
# Step 1: overlay the Linux and Windows price histograms on a single set of axes
fig, ax = plt.subplots(figsize=(12, 6))

# (column, bar colour, legend label) for each series; both use 30 bins plus a KDE curve
price_series = [
    ('linuxPrice', 'blue', 'Linux Price'),
    ('windowsPrice', 'orange', 'Windows Price'),
]
for column, colour, legend_label in price_series:
    sns.histplot(data[column], kde=True, color=colour, label=legend_label, bins=30, ax=ax)

ax.set_title('Price Distribution: Linux vs Windows')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

This cell creates a histogram to compare the price distributions of Linux and Windows, overlaying their kernel density estimates.

The histogram above compares the price distributions for Linux and Windows, showing that Linux prices are generally lower and more concentrated, while Windows prices have a wider range.

In [None]:
# Step 2: one box per region, showing how that region's discount values are spread
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=data, x='region', y='discount', ax=ax)

# Turn the tick labels vertical so the region names do not overlap
plt.xticks(rotation=90)

ax.set(
    title='Regional Discount Distribution',
    xlabel='Region',
    ylabel='Discount (%)',
)
plt.show()

The boxplot above shows the distribution of discounts across different regions, highlighting which regions offer the highest and most consistent discounts.

Step 3: Cost-effectiveness analysis by calculating price per core and price per MB of memory.

In [None]:
# Step 3: Cost-effectiveness analysis
# Each derived column is a price column divided by a size column:
#   new column name -> (price column, size column)
ratio_columns = {
    'price_per_core_linux': ('linuxPrice', 'numberOfCores'),
    'price_per_core_windows': ('windowsPrice', 'numberOfCores'),
    'price_per_mb_linux': ('linuxPrice', 'memoryInMB'),
    'price_per_mb_windows': ('windowsPrice', 'memoryInMB'),
}
for ratio_name, (price_col, size_col) in ratio_columns.items():
    # Element-wise division; a missing price gives a missing ratio
    data[ratio_name] = data[price_col] / data[size_col]

# Show the VM name alongside the four new ratio columns
print('Updated dataset with cost-effectiveness metrics:')
print(data[['name', *ratio_columns]].head())

This cell computes the price per core and price per MB of memory for Linux and Windows servers, and displays the updated dataset with these metrics.

Next, we visualize the above metrics to identify trends and compare cost-effectiveness across configurations.

In [None]:
# Visualizing cost-effectiveness metrics: one Linux-vs-Windows boxplot per metric
# Each entry: (Linux column, Windows column, figure title, y-axis label)
metric_plots = [
    ('price_per_core_linux', 'price_per_core_windows',
     'Price per Core: Linux vs Windows', 'Price per Core'),
    ('price_per_mb_linux', 'price_per_mb_windows',
     'Price per MB: Linux vs Windows', 'Price per MB'),
]

for linux_col, windows_col, plot_title, y_label in metric_plots:
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.boxplot(data=[data[linux_col], data[windows_col]], palette='Set2', ax=ax)
    # The two boxes sit at positions 0 and 1; label them by operating system
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Linux', 'Windows'])
    ax.set_title(plot_title)
    ax.set_ylabel(y_label)
    plt.show()

The visualizations for cost-effectiveness metrics (price per core and price per MB) have been successfully generated, showing comparisons between Linux and Windows.