# Data Visualisation with Python

In [None]:
### Import the necessary packages

# Packages for data manipulation and analysis
import pandas as pd
import numpy as np
import re
from collections import Counter
from scipy.stats import kendalltau

# Packages for visualisation
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## I) Read the processed survey response data

Let's import the processed dataset from the previous session

In [None]:
# Load the cleaned survey responses from the CSV file into a dataframe
data = pd.read_csv('./data/class_survey_data_cleaned.csv')
# Check the first five entries of the dataframe
data.head(5)

As a reminder, the columns encode the following information:

- TIMESTAMP = 'Timestamp'
- BACKGROUND_INDUSTRY = 'What main industry have you worked in?'
- BACKGROUND_YEARS_PROFESSIONAL = 'How many years professional experience do you have?'
- BACKGROUND_YEARS_PROGRAMMING = 'How many years programming experience do you have?'
- BACKGROUND_SKILLS = 'What key experience do you have?'
- IMPORT_DATA_MANAGEMENT = 'Data management'
- IMPORT_STATISTICS = 'Statistics'
- IMPORT_VISUALISATION = 'Visualisation'
- IMPORT_MACHINE_LEARNING = 'Machine Learning & Data Mining'
- IMPORT_SOFTWARE_ENGINEERING = 'Software Engineering'
- IMPORT_COMMUNICATION = 'Communication'
- GOALS_DEFINITION = 'How would you define Data Science in one sentence?'
- GOALS_SKILLS = 'What key skills do you want to learn?'
- GOALS_ROLE = 'What kind of role would you like to go into?'
- GOALS_INDUSTRY = 'What industry would you like to go into?'

## II) Visualisation with Seaborn

### 1) Making a histogram

`seaborn` provides functionality for creating various plots. Let's start with a `histogram` to visualise the distribution of years of professional experience or programming among the respondents.

#### Create histogram for professional experience

In [None]:
# Histogram of BACKGROUND_YEARS_PROFESSIONAL, grouped into 5-year-wide bins

# Use seaborn's "whitegrid" style for the plots
sns.set_style("whitegrid")

plt.figure(figsize=(7, 4))  # 7 x 4 inch figure
hist_ax = sns.histplot(data=data, x='BACKGROUND_YEARS_PROFESSIONAL', binwidth=5)
# Label the plot through the Axes object returned by seaborn
hist_ax.set_title('Distribution of years of professional experience')
hist_ax.set_xlabel('Years of professional experience')
hist_ax.set_ylabel('Count')
plt.show()

The histogram shows the distribution of the years of professional experience among the survey respondents. As can be seen, a majority of respondents have between 0 and 5 years of professional experience, indicating a younger or more early-career demographic among the respondents.

## *STOP PLEASE. THE FOLLOWING IS FOR THE NEXT EXERCISE. THANKS.*
### TODO: Create histogram for programming experience

In [None]:
# TODO: replace the content of this cell with your Python solution
# Until it is replaced, running this cell raises NotImplementedError.
raise NotImplementedError

### 2) Making a bar chart

Here we will display the distribution of the number of responses for each of the six importance variables. For this, we will create a function that takes as input the survey dataset and the column name (one of the six importance variables), and then creates a bar plot showing the count of each rating value for that particular aspect. The function will be used to create six bar plots for the importance variables.

In [None]:
def make_importance_plot(data, column, y_max=70):
    """
    Display a bar plot of the number of responses per rating value
    for a given importance rating variable in the survey data.

    The plot is drawn on the current matplotlib figure and shown immediately.

    Parameters:
    data (DataFrame): The survey data.
    column (str): The column name for the importance rating variable.
    y_max (float or None): Upper limit of the y-axis. Defaults to 70 so that
        plots made with the default share the same scale; pass a larger value
        if a rating can receive more responses, or None to let matplotlib
        choose the limit automatically.
    """

    sns.countplot(data = data, x = column, color = 'grey')
    # A fixed upper limit keeps several plots comparable, but it would clip
    # bars taller than the limit, hence it is configurable.
    if y_max is not None:
        plt.ylim(0, y_max)
    # Remove the prefix and underscores from the column name for the title
    plt.title(column.replace('IMPORT_', '').replace('_', ' ').lower())
    plt.xlabel('Rating')
    plt.ylabel('Number of responses')
    plt.grid(axis = 'both', alpha = 0.5) # Add grid lines to the plot in both directions
    plt.show()

In [None]:
# Column names of the six importance rating variables in the survey data
IMPORT_AREAS = [
    'IMPORT_DATA_MANAGEMENT',
    'IMPORT_STATISTICS',
    'IMPORT_VISUALISATION',
    'IMPORT_MACHINE_LEARNING',
    'IMPORT_SOFTWARE_ENGINEERING',
    'IMPORT_COMMUNICATION'
]

# Creating bar plots for each of the six importance rating variables
for col in IMPORT_AREAS:
    # Open a new 7 x 4 figure first; make_importance_plot draws on the current figure
    plt.figure(figsize = (7, 4))
    make_importance_plot(data, col)

The x-axis in each plot represents the rating given by the respondents for the importance of the given area. The y-axis represents the number of responses for each rating. The majority of respondents gave high ratings (4 or 5) for most areas, except for software engineering, which received more diverse opinions and peaked at a rating of 3.

### Average importance ratings for different Data Science aspects

To be able to better compare respondents' opinions on the different areas of data science, we will make a single bar plot that shows the average importance ratings for the 6 above aspects.   

In [None]:
# Mean rating of every importance variable (one value per column)
import_means = data[IMPORT_AREAS].mean()
# Make the labels readable, e.g. 'IMPORT_DATA_MANAGEMENT' -> 'data management'
import_means.index = [
    name.replace('IMPORT_', '').replace('_', ' ').lower()
    for name in import_means.index
]

# Bar plot with one bar per aspect
plt.figure(figsize=(8, 4))
means_ax = sns.barplot(x=import_means.index, y=import_means.values)
means_ax.set_title('Average importance ratings for different Data Science aspects')
means_ax.set_ylabel('Average rating')
plt.xticks(rotation=45)  # Rotate labels for better readability
plt.show()

## *STOP PLEASE. THE FOLLOWING IS FOR THE NEXT EXERCISE. THANKS.*


### TODO: Make bar charts of known and future industries

In [None]:
# TODO: replace the content of this cell with your Python solution
# Until it is replaced, running this cell raises NotImplementedError.
raise NotImplementedError

### 3) Making a scatterplot

Finally, let's make a scatterplot to show the relationship between professional and programming experience.

In [None]:
# Creating the scatterplot for professional vs programming experience
# Create the figure explicitly, as the other plots in this notebook do, so that
# the plot size does not silently fall back to matplotlib's default.
plt.figure(figsize=(7, 4))
sns.scatterplot(data=data, x="BACKGROUND_YEARS_PROFESSIONAL", y="BACKGROUND_YEARS_PROGRAMMING")
plt.title('Professional vs programming experience')
plt.xlabel('Years of professional experience')
plt.ylabel('Years of programming experience')
plt.show()

There seems to be a positive relationship between professional experience and programming experience, as respondents with more years of professional experience tend to also have more programming experience. Also, most respondents have less than 5 years of experience both professionally and in programming. There are also potentially 3 outliers, most notably the one with 40 years experience in both programming and professional experience (worth checking this respondent's data closely).

### 4) Visualising distributions with box plots

Mean and standard deviation are not informative for skewed data. A `boxplot` is a good visualisation for viewing and comparing distributions. It also shows outliers, e.g., values greater than `Q3+1.5*IQR` or less than `Q1-1.5*IQR`.

In [None]:
# Reshape the two experience columns into long format: one row per value,
# with the source column name in EXPERIENCE_TYPE and the value in YEARS
experience_columns = ['BACKGROUND_YEARS_PROFESSIONAL', 'BACKGROUND_YEARS_PROGRAMMING']
experience_data = pd.melt(
    data[experience_columns],
    var_name='EXPERIENCE_TYPE',
    value_name='YEARS',
)

# One box per experience type, drawn side by side
plt.figure(figsize=(7, 4))
box_ax = sns.boxplot(data=experience_data, x='EXPERIENCE_TYPE', y='YEARS', color='lightgrey')
box_ax.set_title('Professional vs programming experience')
# Replace the raw column names on the x-axis with short labels
plt.xticks([0, 1], ['Professional', 'Programming'])
box_ax.set_xlabel('Type of experience')
box_ax.set_ylabel('Years experience')
plt.show()


Comparing the medians suggests that respondents typically have more professional experience than programming experience. In addition, the professional experience is more spread out than the programming experience, with a larger interquartile range. Finally, there are outliers in both distributions, with some individuals having exceptionally high years of either professional or programming experience. This is more pronounced in programming experience, suggesting a few respondents have been programming for an unusually long time.

### 5) Calculating correlation between two variables

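Pearson's r is the covariance of the two variables divided by the product of their standard deviations. Spearman's rho is a common nonparametric alternative that is used instead of Pearson's r when the data are skewed, contain outliers, or are related monotonically but not linearly; it is Pearson's r computed on the ranks of the values.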

In [None]:
# Correlation between years of professional experience and years of
# programming experience, computed with both the Pearson and Spearman methods
years_professional = data['BACKGROUND_YEARS_PROFESSIONAL']
years_programming = data['BACKGROUND_YEARS_PROGRAMMING']

pearson_corr = years_professional.corr(years_programming, method='pearson')
Spearman_corr = years_professional.corr(years_programming, method='spearman')

# Report both coefficients rounded to two decimal places
print('Pearson correlation coefficient: {:.2f}'.format(pearson_corr))
print('Spearman correlation coefficient: {:.2f}'.format(Spearman_corr))

Calculate Kendall's tau between importance ratings

In [None]:
# Initialize an empty square matrix for the correlation values
# (one row and one column per importance variable, in IMPORT_AREAS order)
kendall_corr_matrix = np.zeros((len(IMPORT_AREAS), len(IMPORT_AREAS)))

# Calculate Kendall's tau for each pair of importance variables
for i, col1 in enumerate(IMPORT_AREAS):
    for j, col2 in enumerate(IMPORT_AREAS):
        # kendalltau returns two values; only the first (tau) is kept here
        tau, _ = kendalltau(data[col1], data[col2])
        kendall_corr_matrix[i, j] = tau

# Defining more meaningful labels for the variables
meaningful_labels = {
    "IMPORT_DATA_MANAGEMENT": "Data Management",
    "IMPORT_STATISTICS": "Statistics",
    "IMPORT_VISUALISATION": "Visualisation",
    "IMPORT_MACHINE_LEARNING": "Machine Learning",
    "IMPORT_SOFTWARE_ENGINEERING": "Software Engineering",
    "IMPORT_COMMUNICATION": "Communication"
}

# Look the labels up in IMPORT_AREAS order, i.e. the order in which the matrix
# was filled, so that every row/column label is guaranteed to belong to the
# variable its values were computed from (rather than relying on the
# dictionary happening to be written in the same order).
axis_labels = [meaningful_labels[col] for col in IMPORT_AREAS]

# Convert the matrix to a DataFrame for better readability, using meaningful labels
kendall_corr_df = pd.DataFrame(kendall_corr_matrix,
                               index = axis_labels,
                               columns = axis_labels)
kendall_corr_df

We can also visualise the correlation matrix as a heatmap as follows:

In [None]:
# Draw the Kendall's tau correlation matrix as an annotated heatmap
plt.figure(figsize=(7, 5))
heatmap_ax = sns.heatmap(
    kendall_corr_df,
    annot=True,   # write the value inside each cell ...
    fmt=".2f",    # ... formatted with two decimal places
    cmap='coolwarm',
    linewidths=0.5,
)
# Rotate the x-axis labels by 45 degrees
plt.xticks(rotation=45)
heatmap_ax.set_title("Heatmap of Kendall's tau correlation matrix for the importance variables")
plt.show()

All correlations are positive and particularly strong for statistics and machine learning, and statistics and visualisation. This means that people who rate statistics as important also tend to rate the other two aspects as important. It is also worth highlighting that machine learning and communication are weakly correlated (tau = 0.07).

### 6) Visualising text data

#### Extracting words and counting their frequencies

We will break the texts under the "data science definition" column into words. During the process, we will also remove stop words, which are words that are commonly used in English but do not add a strong meaning, such as "the", "a", "an", "in", "at", "him", "she", etc.

In [None]:
# English stop words, taken from the NLTK book:
# http://www.nltk.org/book/ch02.html#stopwords_index_term
# All entries are lower-case. The single letters 's' and 't' and the fragment
# 'don' are included in the list as well.
# We use frozenset to prevent changes to the set of stop words
STOP_WORDS = frozenset([ 
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
    ])

# Preprocessing the text data and counting word occurrences
# Combine all non-missing text entries into one large lower-case string
words = " ".join(data['GOALS_DEFINITION'].dropna().str.lower())
# Extract words using regex (\b matches word boundaries, \w+ a run of word characters)
words = re.findall(r'\b\w+\b', words)

# Remove stop words and count word occurrences
# (the text was lower-cased above, so the words can be compared to the
# lower-case stop words directly)
filtered_words = [word for word in words if word not in STOP_WORDS]
filtered_word_count = Counter(filtered_words)
most_common_filtered_words = filtered_word_count.most_common(20) # Get the 20 most common words

# Preparing data for the bar chart: split the (term, frequency) pairs into two
# parallel lists. Comprehensions are used instead of unpacking zip(*pairs)
# because that unpacking raises a ValueError when no words are left.
filtered_terms_list = [term for term, _ in most_common_filtered_words]
filtered_freqs_list = [freq for _, freq in most_common_filtered_words]

# Tuple versions of the same data, kept under their original names
filtered_terms = tuple(filtered_terms_list)
filtered_freqs = tuple(filtered_freqs_list)

#### Plotting term frequencies across data science definitions

Now we can build a simple horizontal bar chart that displays the 20 most common terms across data science definitions.

In [None]:
# Horizontal bar chart: one bar per term, bar length given by its frequency
plt.figure(figsize=(8, 5))
terms_ax = sns.barplot(x=filtered_freqs_list, y=filtered_terms_list, color='grey')
terms_ax.set_title('Top 20 most common terms in Data Science definition')
terms_ax.set_xlabel('Frequency')
terms_ax.set_ylabel('Terms')
terms_ax.grid(axis='both', alpha=0.5)  # Add grid lines in both directions
plt.show()


### Alternative plot: Word Cloud

In [None]:
# Generating a word cloud directly from the word counts (filtered_word_count).
# Passing the counts, rather than a re-joined string of the filtered words,
# means the cloud is built from the given frequencies and the text is not
# tokenised a second time by WordCloud.
wordcloud = WordCloud(
    width = 800, height = 400, background_color = 'white'
).generate_from_frequencies(filtered_word_count)

# Displaying the word cloud
plt.figure(figsize = (11, 6))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')  # the axes carry no information for an image
plt.show()

### 7) Bar plots of the frequencies of the background and desired skills

Let's now build two horizontal bar charts that show the frequencies of the background and desired skills

In [None]:
# Function to process and count skills from a column
def process_skills_column(skills_column, skills_to_consider):
    """
    Process and count skills. The function encodes unlisted skills as 'Other'.

    Each non-missing entry is treated as a comma-separated list of skills.
    Whitespace around each skill is ignored and empty items (e.g. from a
    trailing comma or an empty string) are skipped.

    Args:
        skills_column (series): A column containing skills data, one
            comma-separated string per row. Missing values are ignored.
        skills_to_consider (list): A list of skills to consider. Every skill
            not in this list is counted under 'Other'.

    Returns:
        skill_counts (counter): A Counter object with skills and their counts.
    """
    # Build the lookup set once so that each membership test is cheap
    known_skills = set(skills_to_consider)
    skill_counts = Counter()

    for skills_str in skills_column.dropna():
        # Split on the comma alone and strip afterwards, so that "a,b",
        # "a, b" and "a , b" are all read as the same two skills
        for raw_skill in skills_str.split(','):
            skill = raw_skill.strip()
            if not skill:
                continue
            # Encoding skills as "Other" if they are not in the skills_to_consider list
            skill_counts[skill if skill in known_skills else 'Other'] += 1

    return skill_counts

# Defining the skills to consider: these are counted individually, while any
# other skill is grouped under "Other" by process_skills_column
skills_used = [
    "Relational databases", "NoSQL", "Information Retrieval", "Statistical Analysis", 
    "Visualisation", "Machine learning", "Data mining", "Natural Language Processing", 
    "Programming", "Customer Relationship Management", "Management", "Requirements gathering", 
    "Ethics", "Software Engineering", "Product-driven thinking"
]

# Processing the skills data with encoding unlisted skills as "Other":
# once for the BACKGROUND_SKILLS column and once for the GOALS_SKILLS column
background_skill_counts = process_skills_column(data['BACKGROUND_SKILLS'], skills_used)
goals_skill_counts = process_skills_column(data['GOALS_SKILLS'], skills_used)

In [None]:
def plot_skill_distribution(skill_counts, title):
    """Plot a horizontal bar chart of skill counts, ordered by frequency.

    A new figure is created and shown immediately. If skill_counts is empty,
    a short message is printed and nothing is plotted.

    Args:
        skill_counts (counter): A counter object with skills and their counts.
        title (str): The title for the plot.
    """
    # Nothing to draw for an empty counter; without this guard the unpacking
    # of zip(*...) below would raise a ValueError
    if not skill_counts:
        print('No skills to plot for: {}'.format(title))
        return

    # Sorting the (skill, count) pairs by descending count; sorted() is stable,
    # so skills with equal counts keep their original relative order
    sorted_skills = sorted(skill_counts.items(), key=lambda item: item[1], reverse=True)
    skills, counts = zip(*sorted_skills)  # Unzipping the sorted skills and their counts

    # Creating the bar plot
    plt.figure(figsize=(9, 6))
    # NOTE(review): recent seaborn releases appear to warn when `palette` is
    # passed without `hue` - confirm against the installed seaborn version.
    sns.barplot(x = list(counts), y = list(skills), palette = "Blues_d")
    plt.title(title)
    plt.xlabel('Number of respondents')
    plt.ylabel('Skills')
    plt.show()

# Plotting the skill distributions with sorted bars:
# one chart for the background skills and one for the desired skills
plot_skill_distribution(background_skill_counts, 'Distribution of background skills')
plot_skill_distribution(goals_skill_counts, 'Distribution of desired skills')
