# Voice-Based Server Creation Project

> Interface Notebook

---
# Speech to Text

In [None]:
# @title Configuration and Install dependencies
# @markdown There may be *errors* and/or *warnings* reported during the installation. However, they are to be ignored.
!pip install -U openai-whisper --quiet
!sudo apt update && sudo apt install ffmpeg --quiet
import whisper
model = whisper.load_model("large")

In [None]:
# @title Record Your Command
# @markdown Click on 'Start Recording' to begin. After you finish your command, click on the 'Stop Recording' button.

from IPython.display import Javascript, display
from google.colab import output
from base64 import b64decode

# JavaScript that renders a Start button, records from the microphone until Stop is clicked,
# and sends the recording (base64-encoded) to the kernel callback 'notebook.record_audio'.
# A failure to open the microphone is reported in the output and the Start button is restored.
record_js_code = """
function recordAudio(callback) {
    const startButton = document.getElementById('startButton');
    startButton.style.display = 'none';  // Hide the start button after clicking

    navigator.mediaDevices.getUserMedia({audio: true}).then(stream => {
        const mediaRecorder = new MediaRecorder(stream);
        let audioChunks = [];

        mediaRecorder.addEventListener("dataavailable", event => {
            audioChunks.push(event.data);
        });

        const stopButton = document.createElement('button');
        stopButton.innerHTML = '<span style="color: red;">&#x25A0;</span> Stop Recording';
        stopButton.style = 'font-size: 16px; padding: 10px; margin-top: 5px;';
        stopButton.onclick = function() {
            stopButton.style.display = 'none';  // Hide the stop button after clicking
            mediaRecorder.stop();
            stream.getTracks().forEach(track => track.stop()); // Stop the media stream
        }
        document.body.appendChild(stopButton);

        mediaRecorder.start();

        mediaRecorder.onstop = () => {
            const audioBlob = new Blob(audioChunks, {type: 'audio/wav'});
            const audioUrl = URL.createObjectURL(audioBlob);
            const audioElement = document.createElement('audio');
            audioElement.src = audioUrl;
            audioElement.controls = true;
            document.body.appendChild(audioElement);

            const reader = new FileReader();
            reader.readAsDataURL(audioBlob);
            reader.onloadend = () => {
                callback(reader.result.split(',')[1]);
            };
        };
    }).catch(error => {
        // Microphone missing or permission denied: tell the user and allow another attempt.
        startButton.style.display = '';
        const errorMessage = document.createElement('div');
        errorMessage.style = 'color: red; margin-top: 5px;';
        errorMessage.textContent = 'Could not start recording: ' + error.message;
        document.body.appendChild(errorMessage);
    });
}

const startButton = document.createElement('button');
startButton.innerHTML = '<span style="color: green;">&#x25B6;</span> Start Recording';
startButton.style = 'font-size: 16px; padding: 10px; margin-top: 5px;';
startButton.id = 'startButton';
startButton.onclick = () => recordAudio(base64Data => google.colab.kernel.invokeFunction('notebook.record_audio', [base64Data], {}));
document.body.appendChild(startButton);
"""

def record_audio(base64_audio):
    """Decode a base64 audio payload and store it as ``recording.wav``.

    This is the function registered as the ``notebook.record_audio`` kernel callback.

    Args:
        base64_audio: Base64-encoded audio bytes.

    Side effects:
        Prints the number of bytes received and writes them to ``recording.wav`` in the
        working directory. When the payload decodes to nothing, only a message is printed
        and no file is written.
    """
    raw_bytes = b64decode(base64_audio)
    if not raw_bytes:
        print("No audio data received.")
        return
    print("Audio data received:", len(raw_bytes), "bytes")
    with open('recording.wav', 'wb') as wav_file:
        wav_file.write(raw_bytes)

# Expose `record_audio` to the browser under the name that the recording script passes to
# google.colab.kernel.invokeFunction.
output.register_callback('notebook.record_audio', record_audio)

# Run the recording script in this cell's output so the Start button appears.
display(Javascript(record_js_code))


<IPython.core.display.Javascript object>

In [None]:
# @title Speech to Text Model
# @markdown When you run this cell, you will be able to see the transcription of your voice. For faster results, ensure that you are using a GPU-enabled Colab environment.
# NOTE: `model` must be the Whisper model loaded by this section's setup cell. The setup cell of
# the Paragraph Analysis section rebinds the same name, so re-run the Whisper setup first when
# coming back to this cell.
result = model.transcribe("recording.wav")
# The transcription is kept in `command` for the "Use Transcription of Voice" option.
command = result["text"]
print(command)

---
# Paragraph Analysis

In [None]:
# @title Configuration and Install dependencies
# @markdown There may be *errors* and/or *warnings* reported during the installation. However, they are to be ignored.
!pip install gdown --quiet
#v1
#!gdown --id 1--_eIfYcSDfHS5gLYSQh0ibDKh7zQ4v1 --quiet
#v2
!gdown --id 1zUcqlkAXMorK_ca4mKmwxrydMpElYDA8 --quiet

!pip install torch --quiet
!pip install pandas --quiet
!pip install numpy --quiet
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertTokenizerFast
# Tokenizer used to encode the paragraph before it is passed to the model.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
# model = torch.load('model_full.pth')

# Report which accelerator is available. `device` is informational only: the model below is
# loaded onto the CPU and the inference cell builds its input tensor on the CPU as well.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print( torch.cuda.device_count())
    print('Available:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# The checkpoint is a whole pickled model object (it is called directly in the inference cell),
# so `weights_only=False` is required: PyTorch >= 2.6 defaults to `weights_only=True` and
# refuses to unpickle arbitrary objects.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints from a trusted source.
# NOTE: this rebinds `model`, replacing the Whisper model created in the Speech to Text section.
model = torch.load('model_full-v2.pth', map_location=torch.device('cpu'), weights_only=False)
# Switch to inference mode so dropout layers (if any) do not make predictions vary between runs.
model.eval()
# Label order used by the v1 checkpoint (class index = position in the list), kept for reference:
# [
#     'B-PROTOCOL', 'B-PLAN', 'I-PROTOCOL', 'B-FIAREWALL', 'B-FIREWALL', 'B-DISK',
#     'B-SSH', 'B-OS', 'I-DISK', 'B-EXTRA_DISK', 'I-PLAN', 'I-DATACENTER',
#     'I-CPU', 'B-RAM', 'B-CPU', 'B-BACKUP', 'I-OS', 'I-RAM',
#     'O', 'B-SERVER', 'I-SSH', 'B-NETWORK', 'B-DATACENTER',
# ]

# Label vocabulary for the v2 checkpoint: label name -> class index, where the index is the
# label's position in the list below.
# NOTE(review): 'B-FIAREWALL' looks like a misspelling of 'B-FIREWALL', but it occupies its own
# class index, so it must stay to keep the remaining indices aligned -- confirm against the
# training data before changing it.
label_map = {
    label_name: class_index
    for class_index, label_name in enumerate([
        'B-SSH', 'I-RAM', 'B-EXTRA_DISK_SIZE', 'B-DATACENTER', 'B-BACKUP',
        'I-EXTRA_DISK_SIZE', 'B-NETWORK', 'I-OS', 'B-DISK', 'B-FIREWALL',
        'B-OS', 'I-SSH', 'B-FIAREWALL', 'I-PROTOCOL', 'B-CPU',
        'I-PLAN', 'B-EXTRA_DISK', 'I-CPU', 'I-DATACENTER', 'B-PLAN',
        'B-RAM', 'I-DISK', 'B-PROTOCOL', 'B-SERVER', 'O',
    ])
}





tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

No GPU available, using the CPU instead.


In [None]:
# @title Select Your Paragraph for Analysis
# @markdown Do you want to use a transcription of your voice, or would you prefer to enter a new paragraph?
from google.colab import output
from IPython.display import display, HTML
import json

# HTML for the two choice buttons and an initially hidden text input. The inline script forwards
# the selection to the kernel callback 'notebook.set_test_sentence': the literal string 'voice'
# for the first button, or the text typed into the input once Submit is clicked.
button_html = """
<button id="voiceButton">Use Transcription of Voice</button>
<button id="customTextButton">Use Your Custom Paragraph</button>
<div id="textInput" style="display:none;">
  <input type="text" id="inputText" placeholder="Enter Paragraph">
  <button onclick="setText()">Submit</button>
</div>
<script>
  document.querySelector("#voiceButton").onclick = () => {
    google.colab.kernel.invokeFunction('notebook.set_test_sentence', ['voice'], {});
  };
  document.querySelector("#customTextButton").onclick = () => {
    document.querySelector("#textInput").style.display = "block";
  };
  setText = () => {
    let text = document.querySelector("#inputText").value;
    google.colab.kernel.invokeFunction('notebook.set_test_sentence', [text], {});
  };
</script>
"""

# Function to set the test_sentence in Python
def set_test_sentence(choice):
    """Store the paragraph to analyse in the global ``test_sentence``.

    Args:
        choice: The string ``"voice"`` to reuse the transcription held in the global
            ``command``; any other value is taken as the custom paragraph itself.

    Side effects:
        Sets the global ``test_sentence`` and prints the chosen text. When no usable text is
        available (no transcription yet, or an empty custom paragraph) a hint is printed and
        ``test_sentence`` is left unchanged.
    """
    global test_sentence
    if choice == "voice":
        # `command` exists only after the Speech to Text cell has run. Look it up defensively
        # so the browser-triggered callback reports the problem instead of raising NameError.
        transcription = globals().get("command")
        if transcription is None:
            print("No voice transcription available. Run the Speech to Text section first, "
                  "or enter a custom paragraph.")
            return
        test_sentence = transcription
    else:
        # An empty paragraph would only produce an empty analysis, so reject it early.
        if not choice or not choice.strip():
            print("The paragraph is empty. Please enter some text.")
            return
        test_sentence = choice  # Custom text entered by the user
    print(f"test_sentence set to: {test_sentence}")

# Make `set_test_sentence` callable from the browser under the name used by the button script.
output.register_callback('notebook.set_test_sentence', set_test_sentence)

# Display the HTML buttons
display(HTML(button_html))



test_sentence set to:  Please create a server with 50GB disk.
test_sentence set to:  Please create a server with 50GB disk.


In [None]:
# @title Show Result
# @markdown You can see the output JSON after running this cell.
# Encode the sentence into token ids and wrap them in a batch of one.
# `truncation=True` caps the input at the tokenizer's maximum length so that a very long
# paragraph is shortened instead of failing inside the model.
sample_sentence = tokenizer.encode(test_sentence, truncation=True)
sample_input_ids = torch.tensor([sample_sentence])
# sample_input_ids = torch.tensor([sample_sentence]).cuda()

# Run the model without gradient tracking. The result is stored in `model_output` rather than
# `output`, because `output` is the name under which the earlier cells import
# `google.colab.output`; reusing it would shadow that module for the rest of the session.
with torch.no_grad():
    model_output = model(sample_input_ids)
# Pick the highest-scoring class along the last axis.
# NOTE(review): presumably model_output[0] has shape (batch, tokens, labels) -- confirm.
label_indices = np.argmax(model_output[0].to('cpu').numpy(), axis=2)

# Reverse lookup in the label dictionary: class index -> label name
def get_key(val):
    """Return the label name that ``label_map`` assigns to index ``val``.

    Args:
        val: Class index to look up.

    Returns:
        The first label whose index equals ``val``, or the string ``"key doesn't exist"``
        when no label has that index.
    """
    matching_labels = (name for name, index in label_map.items() if val == index)
    return next(matching_labels, "key doesn't exist")

# Convert the ids back to tokens and rebuild whole words: a piece starting with "##" continues
# the previous word, and the word keeps the label predicted for its first piece.
tokens = tokenizer.convert_ids_to_tokens(sample_input_ids.to('cpu').numpy()[0])
# The sentence-boundary markers are not part of the user's text. Skipping them keeps whatever
# label the model predicts at those positions out of the extracted features.
boundary_tokens = {tokenizer.cls_token, tokenizer.sep_token}
new_tokens, new_label = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token in boundary_tokens:
        continue
    if token.startswith("##") and new_tokens:
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_label.append(get_key(label_idx))
        new_tokens.append(token)

# Pair every merged token with its label and split the pairs into the two table columns
token_label_pairs = list(zip(new_tokens, new_label))
movie_token = [paired_token for paired_token, _ in token_label_pairs]
movie_label = [paired_label for _, paired_label in token_label_pairs]

# One row per word: the token text and its predicted label
df = pd.DataFrame({"Token": movie_token, "Label": movie_label})
df
# df['Simplified_Label'] = df['Label'].str.replace(r'^(B-|I-)', '', regex=True)
# simplified_grouped_tokens = df[df['Label'] != 'O'].groupby('Simplified_Label')['Token'].apply(list).to_dict()
# simplified_grouped_tokens_concatenated = {
#     label: ' '.join(tokens) for label, tokens in simplified_grouped_tokens.items()
# }
# for label in simplified_grouped_tokens_concatenated:
#     simplified_grouped_tokens_concatenated[label] = simplified_grouped_tokens_concatenated[label].replace(' - ', '-')
#     simplified_grouped_tokens_concatenated[label] = simplified_grouped_tokens_concatenated[label].replace(' . ', '.')
# updated_grouped_json_concatenated_data = json.dumps(simplified_grouped_tokens_concatenated, indent=4)
# json_file_path = 'json_file.json'
# with open(json_file_path, 'w') as json_file:
#     json_file.write(updated_grouped_json_concatenated_data)

# import json

# # Replace 'your_file.json' with the path to your JSON file
# file_path = 'json_file.json'

# # Read the JSON file
# with open(file_path, 'r') as file:
#     data = json.load(file)

# # Display the contents of the JSON file
# print(json.dumps(data, indent=4))
import re
import pandas as pd

# Load the dataframe from the uploaded file
# df = pd.read_csv('/path/to/your/file.csv')  # Replace with your file path

# Initialize the JSON object: one entry per feature, all starting out as None
features_json = dict.fromkeys([
    'SSH', 'RAM', 'EXTRA_DISK_SIZE', 'DATACENTER',
    'BACKUP', 'NETWORK', 'OS', 'DISK',
    'FIREWALL', 'CPU', 'PLAN', 'EXTRA_DISK',
    'SERVER', 'PROTOCOL',
])

# Function to extract and concatenate tokens for each feature
def extract_feature_tokens(df, feature_label):
    """Join, in row order, the tokens whose label belongs to ``feature_label``.

    A label belongs to a feature when it equals the feature name once a leading ``B-`` or
    ``I-`` tag is removed. The match is exact because feature names nest inside each other:
    a substring test for ``DISK`` would also collect ``EXTRA_DISK`` and ``EXTRA_DISK_SIZE``
    tokens.

    Args:
        df: DataFrame with string columns ``Token`` and ``Label``.
        feature_label: Feature name without the ``B-``/``I-`` tag, e.g. ``'DISK'``.

    Returns:
        The matching tokens separated by single spaces, or ``''`` when nothing matches.
    """
    entity_names = df['Label'].str.replace(r'^[BI]-', '', regex=True)
    return ' '.join(df.loc[entity_names == feature_label, 'Token'])

# Populate features_json: each feature receives the joined tokens labelled with it
features_json.update(
    (feature_name, extract_feature_tokens(df, feature_name))
    for feature_name in list(features_json)
)

# Pattern matching one run of consecutive digits
digit_extractor = re.compile(r'\d+')

# Function to standardize and process feature data
def process_feature(feature_key, lower=True, replace_dict=None, digit_only=False):
    """Normalise ``features_json[feature_key]`` in place.

    The steps below are applied in order to a text value:
      1. lower-case it when ``lower`` is true;
      2. apply every ``old -> new`` substitution from ``replace_dict``;
      3. collapse the whitespace left behind (e.g. ``'frankfurt data center'`` becomes
         ``'frankfurt'`` rather than ``'frankfurt '``);
      4. when ``digit_only`` is true, reduce the text to its first run of digits as an
         ``int``, or ``None`` when it contains no digit.

    Values that are not strings (``None``, or an ``int`` stored by an earlier ``digit_only``
    call) are left untouched, so calling the function again for the same key is harmless.

    Args:
        feature_key: Key of the entry in ``features_json`` to normalise.
        lower: Whether to lower-case the text.
        replace_dict: Optional mapping of substrings to their replacements.
        digit_only: Whether to keep only the first integer found in the text.
    """
    feature = features_json[feature_key]
    if not isinstance(feature, str):
        return
    if lower:
        feature = feature.lower()
    for old, new in (replace_dict or {}).items():
        feature = feature.replace(old, new)
    # Removing a phrase leaves its surrounding spaces behind; normalise them away.
    feature = ' '.join(feature.split())
    if digit_only:
        matches = digit_extractor.findall(feature)
        feature = int(matches[0]) if matches else None
    features_json[feature_key] = feature

# Filler phrases to strip from specific features: feature -> {phrase: replacement}
replacements = {
    'DATACENTER': {'data center': ''},
    'PLAN': {'plan': ''}
}

# Features that are lower-cased and have their filler phrase removed
for feature, params in replacements.items():
    process_feature(feature, replace_dict=params)

# Features that only go through the default processing
for feature in ('SERVER', 'EXTRA_DISK', 'OS', 'SSH', 'FIREWALL'):
    process_feature(feature)

# Features reduced to the first integer found in their text
for feature in ('CPU', 'RAM', 'EXTRA_DISK_SIZE', 'DISK'):
    process_feature(feature, digit_only=True)

# Specific logic for certain features
def adjust_protocol():
    """Canonicalise ``features_json['PROTOCOL']`` to ``'IPV4'`` or ``'IPV6'`` when recognisable.

    The PROTOCOL text is not lower-cased by the processing steps earlier in this cell, so the
    keywords are matched against a lower-cased copy; otherwise a value such as ``'IPv4'``
    would never match. Empty, ``None`` and unrecognised values are left unchanged.
    """
    protocol = features_json['PROTOCOL']
    if not protocol:
        return
    normalized = protocol.lower()
    if 'ipv4' in normalized:
        features_json['PROTOCOL'] = 'IPV4'
    elif 'ipv6' in normalized:
        features_json['PROTOCOL'] = 'IPV6'

def adjust_network():
    """Canonicalise ``features_json['NETWORK']`` to ``'public'`` or ``'private'`` when recognisable.

    The NETWORK text is not lower-cased by the processing steps earlier in this cell, so the
    keywords are matched against a lower-cased copy; otherwise a value such as ``'Public'``
    would never match. Empty, ``None`` and unrecognised values are left unchanged.
    """
    network = features_json['NETWORK']
    if not network:
        return
    normalized = network.lower()
    if 'public' in normalized:
        features_json['NETWORK'] = 'public'
    elif 'private' in normalized:
        features_json['NETWORK'] = 'private'

def adjust_backup():
    """Reduce ``features_json['BACKUP']`` to ``'daily'``, ``'weekly'`` or ``'monthly'``.

    The BACKUP text is not lower-cased by the processing steps earlier in this cell, so the
    periods are matched against a lower-cased copy; otherwise a value such as ``'Daily'``
    would never match. The first period found, in the order daily, weekly, monthly, wins.
    Empty, ``None`` and unrecognised values are left unchanged.
    """
    backup = features_json['BACKUP']
    if not backup:
        return
    normalized = backup.lower()
    for period in ('daily', 'weekly', 'monthly'):
        if period in normalized:
            features_json['BACKUP'] = period
            break

# Run the feature-specific adjustments in order: protocol, network, backup
for adjust in (adjust_protocol, adjust_network, adjust_backup):
    adjust()

# Final expression of the cell: display the extracted features
features_json


{'SSH': '',
 'RAM': None,
 'EXTRA_DISK_SIZE': None,
 'DATACENTER': '',
 'BACKUP': '',
 'NETWORK': '',
 'OS': '',
 'DISK': 50,
 'FIREWALL': '',
 'CPU': None,
 'PLAN': '',
 'EXTRA_DISK': '',
 'SERVER': '',
 'PROTOCOL': ''}