# Data work for estimating star values

In [32]:
import pandas as pd

# Load the raw export and keep only the provider, country and impact columns
raw_data = (
    pd.read_csv("./impact_data_raw.csv")
    .filter(items=['providerName', 'country', 'impact'])
)

# Coerce 'impact' to a numeric dtype; entries that cannot be parsed become NaN
raw_data = raw_data.assign(
    impact=pd.to_numeric(raw_data['impact'], errors='coerce')
)

# Show the frame while rows with unparseable impact values are still present
print(raw_data)

# Remove the rows whose 'impact' could not be parsed (NaN after coercion)
raw_data = raw_data.dropna(subset=['impact'])

             providerName country  impact
0   Google Cloud Platform  Taiwan  557.00
1   Google Cloud Platform   China  702.00
2   Google Cloud Platform   Japan  516.00
3   Google Cloud Platform   Japan  516.00
4   Google Cloud Platform   India  920.00
..                    ...     ...     ...
77               Scaleway  France  105.00
78              CoreWeave     USA  282.75
79              CoreWeave     USA  424.50
80              CoreWeave     USA  474.58
81                 Seeweb   Italy    0.00

[82 rows x 3 columns]


In [31]:
# Min-max normalization and scaling to a star range (1-5 by default)
def normalize_to_star_rating(ratings, min_stars=1, max_stars=5):
    """Linearly rescale ``ratings`` onto the ``[min_stars, max_stars]`` range.

    The smallest input value maps to ``min_stars`` and the largest to
    ``max_stars``; everything in between is interpolated linearly
    (min-max normalization). NaN entries stay NaN.

    Parameters
    ----------
    ratings : pandas.Series
        Numeric values to rescale.
    min_stars : float, default 1
        Star value assigned to the minimum of ``ratings``.
    max_stars : float, default 5
        Star value assigned to the maximum of ``ratings``.

    Returns
    -------
    pandas.Series
        Star ratings aligned with the index of ``ratings``. If all non-NaN
        inputs are identical, every non-NaN entry receives the midpoint of
        the star range, because there is no spread to normalize against.
    """
    min_val = ratings.min()
    max_val = ratings.max()
    value_range = max_val - min_val

    # Without this guard a constant column would compute 0 / 0 and silently
    # turn every rating into NaN.
    if value_range == 0:
        midpoint = (min_stars + max_stars) / 2
        # Keep NaN where the input was NaN, use the midpoint everywhere else.
        return ratings.where(ratings.isna(), midpoint).astype(float)

    # Apply min-max normalization to [0, 1], then stretch to the star range
    star_ratings = ((ratings - min_val) / value_range) * (max_stars - min_stars) + min_stars

    return star_ratings

# Create a new column 'star_ratings' with normalized values
star_ratings = normalize_to_star_rating(raw_data['impact'])

# Round to one decimal place *before* storing the column. The previous version
# rounded a column on an undefined `df` (NameError on a fresh kernel) and did so
# before the column had been assigned.
# NOTE(review): the old `.round(1) * 2 / 2` was equivalent to `.round(1)`; if
# half-star steps were intended, use `(star_ratings * 2).round() / 2` — confirm.
star_ratings = star_ratings.round(1)
raw_data['star_ratings'] = star_ratings

# Create a new DataFrame with only 'providerName', 'country' and 'star_ratings'
new_df = raw_data[['providerName', 'country', 'star_ratings']]

new_df.to_csv('normalized_impact_ratings_cleaned.csv', index=False)

# Last expression of the cell, so the notebook displays the computed ratings
star_ratings