216 changes: 216 additions & 0 deletions boards/milkymist-one/rtl/system.v
@@ -0,0 +1,216 @@
/*
* Milkyminer
*
* Copyright (c) 2011 fpgaminer@bitcoin-mining.com
* Copyleft 2012 paul@kristianpaul.org
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

`timescale 1ns/1ps

// Top level of the miner for the Milkymist One board.
//
// Work (a 256-bit midstate plus 256 bits of remaining header data) arrives
// over the UART, is hashed with two chained sha256_transform instances on a
// clock derived from clk50, and any nonce whose final hash has its top
// 32 bits equal to zero is sent back over the UART.
//
// Ports:
//   clk50    - board clock input (CLKIN_PERIOD below declares 20 ns)
//   uart_rx  - serial work input
//   uart_tx  - serial golden-nonce output
//   led1     - lit once golden_nonce holds any non-zero value
//   led2     - inverse of uart_tx
module system(
input clk50,

// UART
input uart_rx,
output uart_tx,

// GPIO
// input btn1,
// input btn2,
// input btn3,
output led1,
output led2

// Exp miner
//output [EXT_PORTS-1:0] extminer_txd,
//input [EXT_PORTS-1:0] extminer_rxd,

// Expansion connector
// input [11:0] exp

//DIP

);


// The LOOP_LOG2 parameter determines how unrolled the SHA-256
// calculations are (LOOP = 1 << LOOP_LOG2). For example, a setting of 0
// will completely unroll the calculations, resulting in 128 rounds and a
// large, fast design.
//
// A setting of 1 will result in 64 rounds, with half the size and
// half the speed. 2 will be 32 rounds, with 1/4th the size and speed.
// And so on.
//
// Valid range: [0, 5]
`ifdef CONFIG_LOOP_LOG2
parameter LOOP_LOG2 = `CONFIG_LOOP_LOG2;
`else
parameter LOOP_LOG2 = 5;
`endif

// No need to adjust these parameters
// LOOP is the number of clock cycles each round module is reused for.
localparam [5:0] LOOP = (6'd1 << LOOP_LOG2);
// The nonce will always be larger at the time we discover a valid
// hash. This is its offset from the nonce that gave rise to the valid
// hash (except when LOOP_LOG2 == 0 or 1, where the offset is 131 or
// 66 respectively).
localparam [31:0] GOLDEN_NONCE_OFFSET = (32'd1 << (7 - LOOP_LOG2)) + 32'd1;

//// Inputs of the first hasher: starting state, 512-bit message block and
//// the nonce currently being tried.
reg [255:0] state = 0;
reg [511:0] data = 0;
reg [31:0] nonce = 32'h00000000;

//// DCM
wire hash_clk;
wire hash_clk_dcm;
// 50/5*8 = 80Mhz (CLKFX output = CLKIN * CLKFX_MULTIPLY / CLKFX_DIVIDE)

// NOTE(review): LOCKED is left unconnected and RST is tied low, so nothing
// in this module waits for the DCM to report lock before running.
DCM_SP #(
.CLKDV_DIVIDE(2.0), // 1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5

.CLKFX_DIVIDE(5), // 1 to 32
.CLKFX_MULTIPLY(8), // 2 to 32

.CLKIN_DIVIDE_BY_2("FALSE"),
.CLKIN_PERIOD(20.0),
.CLKOUT_PHASE_SHIFT("NONE"),
.CLK_FEEDBACK("NONE"),
.DESKEW_ADJUST("SYSTEM_SYNCHRONOUS"),
.DUTY_CYCLE_CORRECTION("TRUE"),
.PHASE_SHIFT(0),
.STARTUP_WAIT("TRUE")
) clkgen_hash (
.CLK0(),
.CLK90(),
.CLK180(),
.CLK270(),

.CLK2X(),
.CLK2X180(),

.CLKDV(),
.CLKFX(hash_clk_dcm),
.CLKFX180(),
.LOCKED(),
.CLKFB(),
.CLKIN(clk50),
.RST(1'b0),
.PSEN(1'b0)
);
// Global clock buffer distributing the synthesized clock.
BUFG b1(
.I(hash_clk_dcm),
.O(hash_clk)
);

//// Hashers
wire [255:0] hash, hash2;
reg [5:0] cnt = 6'd0;
reg feedback = 1'b0;

// First pass: transform the 512-bit block in `data` starting from `state`.
sha256_transform #(.LOOP(LOOP)) uut (
.clk(hash_clk),
.feedback(feedback),
.cnt(cnt),
.rx_state(state),
.rx_input(data),
.tx_hash(hash)
);
// Second pass: hash the 256-bit result of the first pass. rx_state is the
// standard SHA-256 initial hash value; the upper half of rx_input is the
// padding for a 256-bit message (0x80000000 marker, length 0x100 bits).
sha256_transform #(.LOOP(LOOP)) uut2 (
.clk(hash_clk),
.feedback(feedback),
.cnt(cnt),
.rx_state(256'h5be0cd191f83d9ab9b05688c510e527fa54ff53a3c6ef372bb67ae856a09e667),
.rx_input({256'h0000010000000000000000000000000000000000000000000000000080000000, hash}),
.tx_hash(hash2)
);


//// Virtual Wire Control
// Work received from the serial port, re-registered once before use.
reg [255:0] midstate_buf = 0, data_buf = 0;
wire [255:0] midstate_vw, data2_vw;


serial_receive serrx (.clk(hash_clk), .RxD(uart_rx), .midstate(midstate_vw), .data2(data2_vw));

//// Virtual Wire Output
reg [31:0] golden_nonce = 0;
// NOTE(review): unlike the other registers here, serial_send has no initial
// value, so it is X in simulation until the first clock edge assigns it.
reg serial_send;
wire serial_busy;

serial_transmit sertx (.clk(hash_clk), .TxD(uart_tx), .send(serial_send), .busy(serial_busy), .word(golden_nonce));


//// Control Unit
reg is_golden_ticket = 1'b0;
reg feedback_d1 = 1'b1;
wire [5:0] cnt_next;
wire [31:0] nonce_next;
wire feedback_next;
// reset is permanently de-asserted; the reset terms below are placeholders.
wire reset;
assign reset = 1'b0;

// cnt cycles through 0..LOOP-1 (LOOP is a power of two, so the mask wraps it).
assign cnt_next = reset ? 6'd0 : (LOOP == 1) ? 6'd0 : (cnt + 6'd1) & (LOOP-1);
// On the first count (cnt==0), load data from previous stage (no feedback)
// on 1..LOOP-1, take feedback from current stage
// This reduces the throughput by a factor of (LOOP), but also reduces the design size by the same amount
assign feedback_next = (LOOP == 1) ? 1'b0 : (cnt_next != {(LOOP_LOG2){1'b0}});
// The nonce only advances on cycles where new data is loaded (no feedback).
assign nonce_next =
reset ? 32'd0 :
feedback_next ? nonce : (nonce + 32'd1);


always @ (posedge hash_clk)
begin
midstate_buf <= midstate_vw;
data_buf <= data2_vw;

cnt <= cnt_next;
feedback <= feedback_next;
feedback_d1 <= feedback;

// Give new data to the hasher
// Block layout, MSB first: padding with a message length of 0x280 bits,
// the nonce under test, then the low 96 bits of the received data.
state <= midstate_buf;
data <= {384'h000002800000000000000000000000000000000000000000000000000000000000000000000000000000000080000000, nonce_next, data_buf[95:0]};
nonce <= nonce_next;


// Check to see if the last hash generated is valid.
// Only the top 32 bits of the second hash are compared against zero, and
// only on cycles where feedback_d1 is low.
is_golden_ticket <= (hash2[255:224] == 32'h00000000) && !feedback_d1;
if(is_golden_ticket)
begin
// TODO: Find a more compact calculation for this
// Subtract the pipeline latency to recover the nonce that produced hash2.
if (LOOP == 1)
golden_nonce <= nonce - 32'd131;
else if (LOOP == 2)
golden_nonce <= nonce - 32'd66;
else
golden_nonce <= nonce - GOLDEN_NONCE_OFFSET;

// NOTE(review): when serial_busy is high here, serial_send is not raised,
// so this golden nonce is latched but never transmitted.
if (!serial_busy) serial_send <= 1;
end // if (is_golden_ticket)
else
serial_send <= 0;

end

/* debug led serial */
//assign led1 = ~uart_rx;
assign led1 = |golden_nonce;
assign led2 = ~uart_tx;

endmodule

5 changes: 5 additions & 0 deletions boards/milkymist-one/sources.mak
@@ -0,0 +1,5 @@
# Source lists shared by the synthesis Makefiles.
# BOARD_DIR and CORES_DIR are not set here; the including Makefile must
# define them.

# Every Verilog file in the board-specific RTL directory.
BOARD_SRC=$(wildcard $(BOARD_DIR)/*.v)

# Every Verilog file of the fpgaminer core.
MINER_SRC=$(wildcard $(CORES_DIR)/fpgaminer/rtl/*.v)

# All core sources (currently only the miner).
CORES_SRC=$(MINER_SRC)
31 changes: 31 additions & 0 deletions boards/milkymist-one/synthesis/Makefile.synplify
@@ -0,0 +1,31 @@
# Synthesis flow using Synplify Pro; produces $(BUILDDIR)/system.ngd, which
# the rules pulled in from common.mak consume.
BOARD_DIR=../rtl
CORES_DIR=../../../cores

include common.mak

include ../sources.mak
SRC=$(BOARD_SRC) $(CORES_SRC)

# Constraints: board-wide file followed by the Synplify-specific one.
$(BUILDDIR)/system.ucf: common.ucf synplify.ucf
cat common.ucf synplify.ucf > $(BUILDDIR)/system.ucf

# Tcl script adding each source file to the project. Paths get a "../"
# prefix because the tool is run from inside $(BUILDDIR).
$(BUILDDIR)/loadsources.tcl: $(SRC)
rm -f $(BUILDDIR)/loadsources.tcl
for i in `echo $^`; do \
echo "add_file -verilog -lib work \"../$$i\"" >> $(BUILDDIR)/loadsources.tcl; \
done

# Convert the UCF with ucf2mgc.
$(BUILDDIR)/system_ucf.tcl: $(BUILDDIR)/system.ucf
cd $(BUILDDIR) && ucf2mgc system.ucf

# No recipe: synplicity.ucf is expected to appear as a by-product of the
# rule that builds system.edf.
$(BUILDDIR)/synplicity.ucf: $(BUILDDIR)/system.edf

$(BUILDDIR)/synplify.prj: synplify.prj
cp synplify.prj $(BUILDDIR)/synplify.prj

# Run synthesis in batch mode.
$(BUILDDIR)/system.edf: $(BUILDDIR)/loadsources.tcl $(BUILDDIR)/synplify.prj $(BUILDDIR)/system_ucf.tcl
cd $(BUILDDIR) && synplify_pro -batch synplify.prj

# Translate the netlist together with the constraints written by synthesis.
$(BUILDDIR)/system.ngd: $(BUILDDIR)/system.edf $(BUILDDIR)/synplicity.ucf
cd $(BUILDDIR) && ngdbuild -uc synplicity.ucf system.edf

22 changes: 22 additions & 0 deletions boards/milkymist-one/synthesis/Makefile.xst
@@ -0,0 +1,22 @@
# Synthesis flow using Xilinx XST; produces $(BUILDDIR)/system.ngd, which
# the rules pulled in from common.mak consume.
BOARD_DIR=../rtl
CORES_DIR=../../../cores

include common.mak

include ../sources.mak
SRC=$(BOARD_SRC) $(CORES_SRC)

# Constraints: board-wide file followed by the XST-specific one.
$(BUILDDIR)/system.ucf: common.ucf xst.ucf
	cat common.ucf xst.ucf > $(BUILDDIR)/system.ucf

# XST project file listing every source. Paths get a "../" prefix because
# xst is run from inside $(BUILDDIR).
$(BUILDDIR)/system.prj: $(SRC)
	rm -f $(BUILDDIR)/system.prj
	for i in $^; do \
		echo "verilog work ../$$i" >> $(BUILDDIR)/system.prj; \
	done

# Use $(BUILDDIR) like every other rule so that the dependency chain stays
# connected if the build directory is ever renamed.
$(BUILDDIR)/system.ngc: $(BUILDDIR)/system.prj
	cd $(BUILDDIR) && xst -ifn ../system.xst

$(BUILDDIR)/system.ngd: $(BUILDDIR)/system.ngc $(BUILDDIR)/system.ucf
	cd $(BUILDDIR) && ngdbuild -uc system.ucf system.ngc
Empty file.
68 changes: 68 additions & 0 deletions boards/milkymist-one/synthesis/common.mak
@@ -0,0 +1,68 @@
# Back-end rules shared by the synthesis flows (map, place-and-route,
# bitstream generation, timing/usage reports, multi-seed runs).
# The including Makefile must provide the rule for $(BUILDDIR)/system.ngd.

# Root of the source tree (location of tools/).
MMDIR?=../../..

BUILDDIR=build

all: $(BUILDDIR)/system.bit $(BUILDDIR)/system.fpg

timing: $(BUILDDIR)/system-routed.twr

usage: $(BUILDDIR)/system-routed.xdl
	$(MMDIR)/tools/xdlanalyze.pl $(BUILDDIR)/system-routed.xdl 0

load: $(BUILDDIR)/system.bit
	jtag -n load.batch

# Sometimes different options are needed to meet timing FIXME
$(BUILDDIR)/system.ncd: $(BUILDDIR)/system.ngd
	cd $(BUILDDIR) && map -ol high -t 2 -w system.ngd

$(BUILDDIR)/system-routed.ncd: $(BUILDDIR)/system.ncd
	cd $(BUILDDIR) && par -ol high -w system.ncd system-routed.ncd

$(BUILDDIR)/system.bit: $(BUILDDIR)/system-routed.ncd
	cd $(BUILDDIR) && bitgen -g LCK_cycle:6 -g Binary:Yes -g INIT_9K:Yes -w system-routed.ncd system.bit

# No recipe: system.bin is written by the bitgen run above (-g Binary:Yes).
$(BUILDDIR)/system.bin: $(BUILDDIR)/system.bit

$(BUILDDIR)/system.fpg: $(BUILDDIR)/system.bin
	$(MMDIR)/tools/byteswap $(BUILDDIR)/system.bin $(BUILDDIR)/system.fpg

$(BUILDDIR)/system-routed.xdl: $(BUILDDIR)/system-routed.ncd
	cd $(BUILDDIR) && xdl -ncd2xdl system-routed.ncd system-routed.xdl

$(BUILDDIR)/system-routed.twr: $(BUILDDIR)/system-routed.ncd
	cd $(BUILDDIR) && trce -e 100 system-routed.ncd system.pcf

# MPPR targets: map and route the same design with six different -t values.
mppr: $(BUILDDIR)/system-routed0.ncd $(BUILDDIR)/system-routed1.ncd $(BUILDDIR)/system-routed2.ncd $(BUILDDIR)/system-routed3.ncd $(BUILDDIR)/system-routed4.ncd $(BUILDDIR)/system-routed5.ncd

$(BUILDDIR)/system0.ncd: $(BUILDDIR)/system.ngd
	cd $(BUILDDIR) && map -ol high -t 1 -w system.ngd -o system0.ncd
$(BUILDDIR)/system1.ncd: $(BUILDDIR)/system.ngd
	cd $(BUILDDIR) && map -ol high -t 20 -w system.ngd -o system1.ncd
$(BUILDDIR)/system2.ncd: $(BUILDDIR)/system.ngd
	cd $(BUILDDIR) && map -ol high -t 40 -w system.ngd -o system2.ncd
$(BUILDDIR)/system3.ncd: $(BUILDDIR)/system.ngd
	cd $(BUILDDIR) && map -ol high -t 60 -w system.ngd -o system3.ncd
$(BUILDDIR)/system4.ncd: $(BUILDDIR)/system.ngd
	cd $(BUILDDIR) && map -ol high -t 80 -w system.ngd -o system4.ncd
$(BUILDDIR)/system5.ncd: $(BUILDDIR)/system.ngd
	cd $(BUILDDIR) && map -ol high -t 99 -w system.ngd -o system5.ncd

$(BUILDDIR)/system-routed0.ncd: $(BUILDDIR)/system0.ncd
	cd $(BUILDDIR) && par -ol high -w system0.ncd system-routed0.ncd > par0
$(BUILDDIR)/system-routed1.ncd: $(BUILDDIR)/system1.ncd
	cd $(BUILDDIR) && par -ol high -w system1.ncd system-routed1.ncd > par1
$(BUILDDIR)/system-routed2.ncd: $(BUILDDIR)/system2.ncd
	cd $(BUILDDIR) && par -ol high -w system2.ncd system-routed2.ncd > par2
$(BUILDDIR)/system-routed3.ncd: $(BUILDDIR)/system3.ncd
	cd $(BUILDDIR) && par -ol high -w system3.ncd system-routed3.ncd > par3
$(BUILDDIR)/system-routed4.ncd: $(BUILDDIR)/system4.ncd
	cd $(BUILDDIR) && par -ol high -w system4.ncd system-routed4.ncd > par4
$(BUILDDIR)/system-routed5.ncd: $(BUILDDIR)/system5.ncd
	cd $(BUILDDIR) && par -ol high -w system5.ncd system-routed5.ncd > par5

# The path is kept literal on purpose: an empty BUILDDIR override must never
# turn this into "rm -rf /*".
clean:
	rm -rf build/*

.PHONY: all timing usage load mppr clean
41 changes: 41 additions & 0 deletions boards/milkymist-one/synthesis/common.ucf
@@ -0,0 +1,41 @@
# Board-level pin and timing constraints shared by all synthesis flows.

# ==== Clock input ====
NET "clk50" LOC = AB11 | IOSTANDARD = LVCMOS33;

# The group name used in the TIMESPEC must match the TNM_NET definition
# exactly (user identifiers in a UCF are case sensitive).
NET "clk50" TNM_NET = "GRPclk50";
TIMESPEC "TSclk50" = PERIOD "GRPclk50" 20 ns HIGH 50%;

# ==== UART ====
NET "uart_rx" LOC = K18 | IOSTANDARD = LVCMOS33 | PULLUP;
NET "uart_tx" LOC = L17 | IOSTANDARD = LVCMOS33 | SLEW = SLOW;

# ==== Pushbuttons ====
#NET "btn1" LOC = AB4 | IOSTANDARD = LVCMOS33;
#NET "btn2" LOC = AA4 | IOSTANDARD = LVCMOS33;
#NET "btn3" LOC = AB5 | IOSTANDARD = LVCMOS33;

# ==== LEDs ====
NET "led1" LOC = B16 | IOSTANDARD = LVCMOS33 | SLEW = QUIETIO | DRIVE = 24;
NET "led2" LOC = A16 | IOSTANDARD = LVCMOS33 | SLEW = QUIETIO | DRIVE = 24;

# ==== Expansion connector ====
#NET "exp(0)" LOC = A20;
#NET "exp(1)" LOC = F16;
#NET "exp(2)" LOC = A21;
#NET "exp(3)" LOC = F17;
#NET "exp(4)" LOC = B21;
#NET "exp(5)" LOC = H16;
#NET "exp(6)" LOC = B22;
#NET "exp(7)" LOC = H17;
#NET "exp(8)" LOC = G16;
#NET "exp(9)" LOC = J16;
#NET "exp(10)" LOC = G17;
#NET "exp(11)" LOC = K16;

#NET "exp(*)" IOSTANDARD = LVCMOS33;

# avoid floating signals
#NET "exp(*)" PULLDOWN;

# ==== Timing fixes ====
# 12.5 ns period = 80 MHz on the net named hash_clk.
NET "hash_clk" TNM_NET = hash_clk;
TIMESPEC TS_hash_clk = PERIOD "hash_clk" 12.5 ns HIGH 50 %;
1 change: 1 addition & 0 deletions boards/milkymist-one/synthesis/ioffs.sdc
@@ -0,0 +1 @@
# Sets the syn_useioff attribute to 1 design-wide.
# Presumably this asks Synplify to pack registers into the I/O blocks - TODO confirm against the Synplify attribute reference.
define_global_attribute syn_useioff 1
5 changes: 5 additions & 0 deletions boards/milkymist-one/synthesis/load.batch
@@ -0,0 +1,5 @@
cable milkymist
detect
instruction CFG_OUT 000100 BYPASS
instruction CFG_IN 000101 BYPASS
pld load build/system.bit
38 changes: 38 additions & 0 deletions boards/milkymist-one/synthesis/synplify.prj
@@ -0,0 +1,38 @@
# Synplify Pro project script, run in batch mode from the build directory.
set_option -vlog_std v2001

# File list generated by the Makefile, plus the I/O register attribute file.
source "loadsources.tcl"
#add_file "system_ucf.tcl"
add_file "../ioffs.sdc"

# Target device.
set_option -technology Spartan6
set_option -part XC6SLX45
set_option -package FGG484
set_option -speed_grade -2
set_option -part_companion ""

set_option -top_module system

set_option -symbolic_fsm_compiler 1

set_option -compiler_compatible 0
set_option -resource_sharing 1

set_option -run_prop_extract 1
set_option -maxfan 10000
set_option -disable_io_insertion 0
set_option -pipe 1
set_option -retiming 0
set_option -update_models_cp 0
set_option -fixgatedclocks 3
set_option -fixgeneratedclocks 3
set_option -no_sequential_opt 0
set_option -enable_prepacking 1

# NOTE(review): the first include directory points at a "pfpu" core, while
# the source list for this board only collects cores/fpgaminer/rtl; this
# path looks like a leftover - confirm and remove if unused.
set_option -include_path "../../../../cores/pfpu/rtl/;../../rtl/"

set_option -write_verilog 0
set_option -write_vhdl 0
set_option -write_vif 1
set_option -write_apr_constraint 1

# Name of the netlist produced by synthesis.
project -result_file "system.edf"
13 changes: 13 additions & 0 deletions boards/milkymist-one/synthesis/synplify.ucf
@@ -0,0 +1,13 @@
# Placement constraints used only by the Synplify flow.
# NOTE(review): the board top level (rtl/system.v) instantiates a DCM named
# "clkgen_hash" and a clock buffer named "b1"; the other instance names
# constrained below (clkgen_sys, b2, ddram/*, b_phy_*) do not appear there.
# They look inherited from a larger design - confirm they are still wanted.
INST "clkgen_sys" LOC = DCM_X0Y1;
INST "b1" LOC = BUFGMUX_X2Y3;
INST "b2" LOC = BUFGMUX_X3Y5;
INST "ddram/clkgen_dqs" LOC = DCM_X0Y6;
INST "ddram/b1" LOC = BUFGMUX_X2Y4;
INST "ddram/b2" LOC = BUFGMUX_X2Y2;

INST "b_phy_tx_clk0" LOC = BUFIO2_X3Y10;
INST "b_phy_tx_clk" LOC = BUFGMUX_X3Y7;
INST "b_phy_rx_clk0" LOC = BUFIO2_X3Y11;
INST "b_phy_rx_clk" LOC = BUFGMUX_X3Y8;

#NET "ac97_clk" CLOCK_DEDICATED_ROUTE = FALSE;
11 changes: 11 additions & 0 deletions boards/milkymist-one/synthesis/system-rescue.xst
@@ -0,0 +1,11 @@
run
-ifn system.prj
-top system
-ifmt MIXED
-opt_mode SPEED
-opt_level 2
-resource_sharing no
-reduce_control_sets auto
-ofn system.ngc
-p xc6slx45-fgg484-2
-define RESCUE
10 changes: 10 additions & 0 deletions boards/milkymist-one/synthesis/system.xst
@@ -0,0 +1,10 @@
run
-ifn system.prj
-top system
-ifmt MIXED
-opt_mode SPEED
-opt_level 2
-resource_sharing no
-reduce_control_sets auto
-ofn system.ngc
-p xc6slx45-fgg484-2
1 change: 1 addition & 0 deletions boards/milkymist-one/synthesis/xst.ucf
@@ -0,0 +1 @@
# Constraint used only by the XST flow.
# Presumably "ES" declares engineering-sample silicon - TODO confirm that this matches the FPGA fitted on the board.
CONFIG STEPPING = "ES";
94 changes: 94 additions & 0 deletions cores/fpgaminer/rtl/async_receiver.v
@@ -0,0 +1,94 @@
// RS-232 RX module
// (c) fpga4fun.com KNJN LLC - 2003, 2004, 2005, 2006

// RS-232 receiver: 8 data bits, LSB first, one start and one stop bit.
//
// Ports:
//   clk             - system clock; its frequency must equal ClkFrequency
//   RxD             - serial input, idle high
//   RxD_data_ready  - one-clock pulse when RxD_data holds a complete byte
//   RxD_data        - last byte shifted in
//   RxD_endofpacket - one-clock pulse once the line has been quiet for a while
//   RxD_idle        - high while no data is being received
//
// All registers carry an explicit initial value so that simulation starts
// from a defined idle state (the accumulator would otherwise stay X forever,
// since nothing ever loads it).
module async_receiver(clk, RxD, RxD_data_ready, RxD_data, RxD_endofpacket, RxD_idle);
	input clk, RxD;
	output RxD_data_ready; // one clock pulse when RxD_data is valid
	output [7:0] RxD_data;

	parameter ClkFrequency = 80000000; // 80MHz
	//parameter ClkFrequency = 25000000; // 25MHz
	parameter Baud = 115200;

	// We also detect if a gap occurs in the received stream of characters
	// That can be useful if multiple characters are sent in burst
	// so that multiple characters can be treated as a "packet"
	output RxD_endofpacket; // one clock pulse, when no more data is received (RxD_idle is going high)
	output RxD_idle; // no data is being received

	// Baud generator (we use 8 times oversampling)
	// Phase accumulator: the carry-out bit pulses at 8x the baud rate.
	parameter Baud8 = Baud*8;
	parameter Baud8GeneratorAccWidth = 16;
	wire [Baud8GeneratorAccWidth:0] Baud8GeneratorInc = ((Baud8<<(Baud8GeneratorAccWidth-7))+(ClkFrequency>>8))/(ClkFrequency>>7);
	reg [Baud8GeneratorAccWidth:0] Baud8GeneratorAcc = 0;
	always @(posedge clk) Baud8GeneratorAcc <= Baud8GeneratorAcc[Baud8GeneratorAccWidth-1:0] + Baud8GeneratorInc;
	wire Baud8Tick = Baud8GeneratorAcc[Baud8GeneratorAccWidth];

	////////////////////////////
	// Two-stage synchronizer on the (inverted) input.
	reg [1:0] RxD_sync_inv = 2'b00;
	always @(posedge clk) if(Baud8Tick) RxD_sync_inv <= {RxD_sync_inv[0], ~RxD};
	// we invert RxD, so that the idle becomes "0", to prevent a phantom character to be received at startup

	// Saturating up/down counter acting as a glitch filter: the filtered
	// bit only changes once the counter reaches either end.
	reg [1:0] RxD_cnt_inv = 2'b00;
	reg RxD_bit_inv = 1'b0;

	always @(posedge clk)
	if(Baud8Tick)
	begin
		if( RxD_sync_inv[1] && RxD_cnt_inv!=2'b11) RxD_cnt_inv <= RxD_cnt_inv + 2'h1;
		else
		if(~RxD_sync_inv[1] && RxD_cnt_inv!=2'b00) RxD_cnt_inv <= RxD_cnt_inv - 2'h1;

		if(RxD_cnt_inv==2'b00) RxD_bit_inv <= 1'b0;
		else
		if(RxD_cnt_inv==2'b11) RxD_bit_inv <= 1'b1;
	end

	reg [3:0] state = 4'b0000;
	reg [3:0] bit_spacing = 4'b0000;

	// "next_bit" controls when the data sampling occurs
	// depending on how noisy the RxD is, different values might work better
	// with a clean connection, values from 8 to 11 work
	wire next_bit = (bit_spacing==4'd10);

	// bit_spacing counts oversampling ticks; once bit 3 is set it stays set
	// while the low three bits keep wrapping.
	always @(posedge clk)
	if(state==0)
		bit_spacing <= 4'b0000;
	else
	if(Baud8Tick)
		bit_spacing <= {bit_spacing[2:0] + 4'b0001} | {bit_spacing[3], 3'b000};

	always @(posedge clk)
	if(Baud8Tick)
	case(state)
		4'b0000: if(RxD_bit_inv) state <= 4'b1000; // start bit found?
		4'b1000: if(next_bit) state <= 4'b1001; // bit 0
		4'b1001: if(next_bit) state <= 4'b1010; // bit 1
		4'b1010: if(next_bit) state <= 4'b1011; // bit 2
		4'b1011: if(next_bit) state <= 4'b1100; // bit 3
		4'b1100: if(next_bit) state <= 4'b1101; // bit 4
		4'b1101: if(next_bit) state <= 4'b1110; // bit 5
		4'b1110: if(next_bit) state <= 4'b1111; // bit 6
		4'b1111: if(next_bit) state <= 4'b0001; // bit 7
		4'b0001: if(next_bit) state <= 4'b0000; // stop bit
		default: state <= 4'b0000;
	endcase

	// Shift register filled from the top, so the first received bit ends up
	// in bit 0.
	reg [7:0] RxD_data = 8'h00;
	always @(posedge clk)
	if(Baud8Tick && next_bit && state[3]) RxD_data <= {~RxD_bit_inv, RxD_data[7:1]};

	reg RxD_data_ready = 1'b0, RxD_data_error = 1'b0;
	always @(posedge clk)
	begin
		RxD_data_ready <= (Baud8Tick && next_bit && state==4'b0001 && ~RxD_bit_inv); // ready only if the stop bit is received
		RxD_data_error <= (Baud8Tick && next_bit && state==4'b0001 && RxD_bit_inv); // error if the stop bit is not received
	end

	// Counts oversampling ticks spent idle; saturates when bit 4 is set.
	reg [4:0] gap_count = 5'h00;
	always @(posedge clk) if (state!=0) gap_count<=5'h00; else if(Baud8Tick & ~gap_count[4]) gap_count <= gap_count + 5'h01;
	assign RxD_idle = gap_count[4];
	reg RxD_endofpacket = 1'b0; always @(posedge clk) RxD_endofpacket <= Baud8Tick & (gap_count==5'h0F);

endmodule
73 changes: 73 additions & 0 deletions cores/fpgaminer/rtl/async_transmitter.v
@@ -0,0 +1,73 @@
// RS-232 TX module
// (c) fpga4fun.com KNJN LLC - 2003, 2004, 2005, 2006

//`define DEBUG // in DEBUG mode, we output one bit per clock cycle (useful for faster simulations)

// RS-232 transmitter: one start bit, 8 data bits (LSB first), two stop bits.
//
// Ports:
//   clk       - system clock; its frequency must equal ClkFrequency
//   TxD_start - request to send; sampled only while the transmitter is idle
//   TxD_data  - byte to send
//   TxD       - serial output, idle high
//   TxD_busy  - high from the cycle after a start is accepted until the
//               last stop bit has been sent
//
// Registers carry explicit initial values so that simulation starts in the
// idle state, and TxD starts at the idle (high) level.
module async_transmitter(clk, TxD_start, TxD_data, TxD, TxD_busy);
	input clk, TxD_start;
	input [7:0] TxD_data;
	output TxD, TxD_busy;

	parameter ClkFrequency = 80000000; // 80MHz
	parameter Baud = 115200;
	parameter RegisterInputData = 1; // in RegisterInputData mode, the input doesn't have to stay valid while the character is been transmitted

	// Baud generator
	// Phase accumulator: the carry-out bit pulses once per bit period. It
	// only advances while a character is being sent.
	parameter BaudGeneratorAccWidth = 16;
	reg [BaudGeneratorAccWidth:0] BaudGeneratorAcc = 0;
`ifdef DEBUG
	wire [BaudGeneratorAccWidth:0] BaudGeneratorInc = 17'h10000;
`else
	wire [BaudGeneratorAccWidth:0] BaudGeneratorInc = ((Baud<<(BaudGeneratorAccWidth-4))+(ClkFrequency>>5))/(ClkFrequency>>4);
`endif

	wire BaudTick = BaudGeneratorAcc[BaudGeneratorAccWidth];
	wire TxD_busy;
	always @(posedge clk) if(TxD_busy) BaudGeneratorAcc <= BaudGeneratorAcc[BaudGeneratorAccWidth-1:0] + BaudGeneratorInc;

	// Transmitter state machine
	reg [3:0] state = 4'b0000;
	wire TxD_ready = (state==0);
	assign TxD_busy = ~TxD_ready;

	// Optional input register: captures the byte when a start is accepted.
	reg [7:0] TxD_dataReg = 8'h00;
	always @(posedge clk) if(TxD_ready & TxD_start) TxD_dataReg <= TxD_data;
	wire [7:0] TxD_dataD = RegisterInputData ? TxD_dataReg : TxD_data;

	always @(posedge clk)
	case(state)
		4'b0000: if(TxD_start) state <= 4'b0001;
		4'b0001: if(BaudTick) state <= 4'b0100;
		4'b0100: if(BaudTick) state <= 4'b1000; // start
		4'b1000: if(BaudTick) state <= 4'b1001; // bit 0
		4'b1001: if(BaudTick) state <= 4'b1010; // bit 1
		4'b1010: if(BaudTick) state <= 4'b1011; // bit 2
		4'b1011: if(BaudTick) state <= 4'b1100; // bit 3
		4'b1100: if(BaudTick) state <= 4'b1101; // bit 4
		4'b1101: if(BaudTick) state <= 4'b1110; // bit 5
		4'b1110: if(BaudTick) state <= 4'b1111; // bit 6
		4'b1111: if(BaudTick) state <= 4'b0010; // bit 7
		4'b0010: if(BaudTick) state <= 4'b0011; // stop1
		4'b0011: if(BaudTick) state <= 4'b0000; // stop2
		default: if(BaudTick) state <= 4'b0000;
	endcase

	// Output mux: the low three state bits select the data bit being sent.
	// Combinational logic, hence blocking assignments.
	reg muxbit;
	always @( * )
	case(state[2:0])
		3'd0: muxbit = TxD_dataD[0];
		3'd1: muxbit = TxD_dataD[1];
		3'd2: muxbit = TxD_dataD[2];
		3'd3: muxbit = TxD_dataD[3];
		3'd4: muxbit = TxD_dataD[4];
		3'd5: muxbit = TxD_dataD[5];
		3'd6: muxbit = TxD_dataD[6];
		3'd7: muxbit = TxD_dataD[7];
	endcase

	// Put together the start, data and stop bits
	// States 0..3 (idle, pre-start, stop bits) drive the line high; state 4
	// (start bit) drives it low; states 8..15 drive the selected data bit.
	reg TxD = 1'b1;
	always @(posedge clk) TxD <= (state<4) | (state[3] & muxbit); // register the output to make it glitch free

endmodule
125 changes: 125 additions & 0 deletions cores/fpgaminer/rtl/serial.v
@@ -0,0 +1,125 @@
//`include "async_receiver.v"
//`include "async_transmitter.v"

// by teknohog, replaces virtual_wire by rs232

// Collects 64 bytes from the serial line and presents them as two 256-bit
// words. The first byte received ends up in the most significant byte of
// `midstate`, the last one in the least significant byte of `data2`.
// The outputs only change once a full 64-byte frame has been collected.
module serial_receive(clk, RxD, midstate, data2);
	input clk;
	input RxD;
	output [255:0] midstate;
	output [255:0] data2;

	// Byte-level receiver.
	wire byte_valid;
	wire [7:0] byte_in;

	async_receiver deserializer(.clk(clk), .RxD(RxD), .RxD_data_ready(byte_valid), .RxD_data(byte_in));

	// 256 bits midstate + 256 bits data at the same time = 64 bytes

	// Might be a good idea to add some fixed start and stop sequences,
	// so we really know we got all the data and nothing more. If a
	// test for these fails, should ask for new data, so it needs more
	// logic on the return side too. The check bits could be legible
	// 7seg for quick feedback :)

	reg [511:0] input_buffer;           // frame being assembled
	reg [511:0] input_copy;             // last complete frame
	reg [6:0] demux_state = 7'b0000000; // number of bytes collected so far

	assign {midstate, data2} = input_copy;

	// we probably don't need a busy signal here, just read the latest
	// complete input that is available.

	always @(posedge clk)
		if (demux_state == 7'b1000000)
		begin
			// 64 bytes collected: publish the frame and start over.
			input_copy <= input_buffer;
			demux_state <= 0;
		end
		else if (byte_valid)
		begin
			// Shift the new byte in at the bottom.
			input_buffer <= {input_buffer[503:0], byte_in};
			demux_state <= demux_state + 1;
		end

endmodule // serial_receive

// Sends a 32-bit word over the serial line as four bytes, most significant
// byte first.
//
// Ports:
//   clk  - system clock
//   TxD  - serial output
//   busy - high while a word is being split into bytes
//   send - request to transmit `word`; only honoured while busy is low
//   word - value to transmit, captured when the request is accepted
//
// Internal registers carry explicit initial values so that the start
// request to the byte transmitter is defined from power-up.
module serial_transmit (clk, TxD, busy, send, word);

	input clk;
	output TxD;

	input [31:0] word;
	input send;
	output busy;

	// split 4-byte output into bytes

	wire TxD_start;
	wire TxD_busy;

	reg [7:0] out_byte = 8'h00;
	reg serial_start = 1'b0;
	// Bit 3 set = transfer in progress; bit 0 distinguishes "issue a byte"
	// (0) from "wait" (1); the counter wraps to 0 after the fourth byte.
	reg [3:0] mux_state = 4'b0000;

	reg [31:0] word_copy = 32'h00000000;

	assign TxD_start = serial_start;

	assign busy = (|mux_state);

	always @(posedge clk)
	begin
		// Testing for busy is problematic if we are keeping the
		// module busy all the time :-/ So we need some wait stages
		// between the bytes.

		if (!busy && send)
		begin
			mux_state <= 4'b1000;
			word_copy <= word;
		end

		// issue the next byte once the byte transmitter is free
		else if (mux_state[3] && ~mux_state[0] && !TxD_busy)
		begin
			serial_start <= 1;
			mux_state <= mux_state + 1;

			out_byte <= word_copy[31:24];
			word_copy <= (word_copy << 8);
		end

		// wait stages
		else if (mux_state[3] && mux_state[0])
		begin
			serial_start <= 0;
			if (!TxD_busy) mux_state <= mux_state + 1;
		end
	end

	async_transmitter serializer(.clk(clk), .TxD(TxD), .TxD_start(TxD_start), .TxD_data(out_byte), .TxD_busy(TxD_busy));
endmodule // serial_transmit
86 changes: 86 additions & 0 deletions cores/fpgaminer/rtl/sha-256-functions.v
@@ -0,0 +1,86 @@
/*
*
* Copyright (c) 2011 fpgaminer@bitcoin-mining.com
*
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/


`timescale 1ns/1ps

// XOR of the 32-bit input rotated right by 2, 13 and 22 bit positions.
module e0 (x, y);

	input [31:0] x;
	output [31:0] y;

	wire [31:0] rotr2  = {x[1:0],  x[31:2]};
	wire [31:0] rotr13 = {x[12:0], x[31:13]};
	wire [31:0] rotr22 = {x[21:0], x[31:22]};

	assign y = rotr2 ^ rotr13 ^ rotr22;

endmodule


// XOR of the 32-bit input rotated right by 6, 11 and 25 bit positions.
module e1 (x, y);

	input [31:0] x;
	output [31:0] y;

	wire [31:0] rotr6  = {x[5:0],  x[31:6]};
	wire [31:0] rotr11 = {x[10:0], x[31:11]};
	wire [31:0] rotr25 = {x[24:0], x[31:25]};

	assign y = rotr6 ^ rotr11 ^ rotr25;

endmodule


// Bitwise "choose": each output bit is taken from y where the matching bit
// of x is 1, and from z where it is 0.
module ch (x, y, z, o);

	input [31:0] x, y, z;
	output [31:0] o;

	wire [31:0] from_y = x & y;
	wire [31:0] from_z = ~x & z;

	assign o = from_y | from_z;

endmodule


// Bitwise majority: each output bit is 1 when at least two of the matching
// bits of x, y and z are 1.
module maj (x, y, z, o);

	input [31:0] x, y, z;
	output [31:0] o;

	wire [31:0] xy = x & y;
	wire [31:0] xz = x & z;
	wire [31:0] yz = y & z;

	assign o = xy | xz | yz;

endmodule


// XOR of the input rotated right by 7, rotated right by 18, and shifted
// right by 3 (zero filled).
module s0 (x, y);

	input [31:0] x;
	output [31:0] y;

	wire [31:0] rotr7  = {x[6:0],  x[31:7]};
	wire [31:0] rotr18 = {x[17:0], x[31:18]};
	wire [31:0] shr3   = {3'b000,  x[31:3]};

	assign y = rotr7 ^ rotr18 ^ shr3;

endmodule


// XOR of the input rotated right by 17, rotated right by 19, and shifted
// right by 10 (zero filled).
module s1 (x, y);

	input [31:0] x;
	output [31:0] y;

	wire [31:0] rotr17 = {x[16:0], x[31:17]};
	wire [31:0] rotr19 = {x[18:0], x[31:19]};
	wire [31:0] shr10  = {10'b0000000000, x[31:10]};

	assign y = rotr17 ^ rotr19 ^ shr10;

endmodule


162 changes: 162 additions & 0 deletions cores/fpgaminer/rtl/sha256_transform.v
@@ -0,0 +1,162 @@
/*
*
* Copyright (c) 2011 fpgaminer@bitcoin-mining.com
*
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/


`timescale 1ns/1ps

// A quick define to help index 32-bit words inside a larger register.
`define IDX(x) (((x)+1)*(32)-1):((x)*(32))


// Perform a SHA-256 transformation on the given 512-bit data, and 256-bit
// initial state,
// Outputs one 256-bit hash every LOOP cycle(s).
//
// The LOOP parameter determines both the size and speed of this module.
// A value of 1 implies a fully unrolled SHA-256 calculation spanning 64 round
// modules and calculating a full SHA-256 hash every clock cycle. A value of
// 2 implies a half-unrolled loop, with 32 round modules and calculating
// a full hash in 2 clock cycles. And so forth.
// Chain of 64/LOOP round modules implementing one SHA-256 block transform.
//
//   clk      - clock for every round register and for tx_hash
//   feedback - 0: each stage takes its input from the previous stage (the
//                 first one from rx_input / rx_state);
//              1: each stage re-processes its own previous output
//   cnt      - pass number within the current group of LOOP cycles; selects
//              the round constant used by each stage
//   rx_state - 256-bit starting state, also added to the final result
//   rx_input - 512-bit message block
//   tx_hash  - registered result, updated only on cycles with feedback == 0
module sha256_transform #(
	parameter LOOP = 6'd4
) (
	input clk,
	input feedback,
	input [5:0] cnt,
	input [255:0] rx_state,
	input [511:0] rx_input,
	output reg [255:0] tx_hash
);

	// Constants defined by the SHA-2 standard.
	// The first constant sits in the most significant word, hence the
	// "63 - round" indexing below.
	localparam Ks = {
		32'h428a2f98, 32'h71374491, 32'hb5c0fbcf, 32'he9b5dba5,
		32'h3956c25b, 32'h59f111f1, 32'h923f82a4, 32'hab1c5ed5,
		32'hd807aa98, 32'h12835b01, 32'h243185be, 32'h550c7dc3,
		32'h72be5d74, 32'h80deb1fe, 32'h9bdc06a7, 32'hc19bf174,
		32'he49b69c1, 32'hefbe4786, 32'h0fc19dc6, 32'h240ca1cc,
		32'h2de92c6f, 32'h4a7484aa, 32'h5cb0a9dc, 32'h76f988da,
		32'h983e5152, 32'ha831c66d, 32'hb00327c8, 32'hbf597fc7,
		32'hc6e00bf3, 32'hd5a79147, 32'h06ca6351, 32'h14292967,
		32'h27b70a85, 32'h2e1b2138, 32'h4d2c6dfc, 32'h53380d13,
		32'h650a7354, 32'h766a0abb, 32'h81c2c92e, 32'h92722c85,
		32'ha2bfe8a1, 32'ha81a664b, 32'hc24b8b70, 32'hc76c51a3,
		32'hd192e819, 32'hd6990624, 32'hf40e3585, 32'h106aa070,
		32'h19a4c116, 32'h1e376c08, 32'h2748774c, 32'h34b0bcb5,
		32'h391c0cb3, 32'h4ed8aa4a, 32'h5b9cca4f, 32'h682e6ff3,
		32'h748f82ee, 32'h78a5636f, 32'h84c87814, 32'h8cc70208,
		32'h90befffa, 32'ha4506ceb, 32'hbef9a3f7, 32'hc67178f2};


	genvar i;

	generate

		for (i = 0; i < 64/LOOP; i = i + 1) begin : HASHERS
			wire [511:0] W;
			wire [255:0] state;

			// Values coming from upstream: the module inputs for the
			// first stage, the previous stage's outputs otherwise.
			wire [511:0] upstream_w;
			wire [255:0] upstream_state;

			if (i == 0) begin : FROM_INPUT
				assign upstream_w = rx_input;
				assign upstream_state = rx_state;
			end else begin : FROM_PREVIOUS
				assign upstream_w = HASHERS[i-1].W;
				assign upstream_state = HASHERS[i-1].state;
			end

			// Stage i handles rounds LOOP*i .. LOOP*i+LOOP-1, one per
			// value of cnt.
			sha256_digester U (
				.clk(clk),
				.k(Ks[32*(63-LOOP*i-cnt) +: 32]),
				.rx_w(feedback ? W : upstream_w),
				.rx_state(feedback ? state : upstream_state),
				.tx_w(W),
				.tx_state(state)
			);
		end

	endgenerate

	// Output of the last stage in the chain.
	wire [255:0] final_state = HASHERS[64/LOOP-6'd1].state;

	// Word-wise (32-bit, carries not propagated between words) addition of
	// the starting state to the last stage's output.
	integer word;

	always @ (posedge clk)
	begin
		if (!feedback)
		begin
			for (word = 0; word < 8; word = word + 1)
				tx_hash[32*word +: 32] <= rx_state[32*word +: 32] + final_state[32*word +: 32];
		end
	end


endmodule


// One SHA-256 round with registered outputs.
//
//   clk      - clock for the output registers
//   k        - round constant
//   rx_w     - 16-word message schedule window; word 0 (bits 31:0) is the
//              word consumed by this round
//   rx_state - eight 32-bit working variables, word 0 in bits 31:0
//   tx_w     - window advanced by one word, with the newly computed word
//              placed in the top position
//   tx_state - working variables after the round
module sha256_digester (clk, k, rx_w, rx_state, tx_w, tx_state);

	input clk;
	input [31:0] k;
	input [511:0] rx_w;
	input [255:0] rx_state;

	output reg [511:0] tx_w;
	output reg [255:0] tx_state;


	// Working variables, word 0 first.
	wire [31:0] a = rx_state[`IDX(0)];
	wire [31:0] b = rx_state[`IDX(1)];
	wire [31:0] c = rx_state[`IDX(2)];
	wire [31:0] d = rx_state[`IDX(3)];
	wire [31:0] e = rx_state[`IDX(4)];
	wire [31:0] f = rx_state[`IDX(5)];
	wire [31:0] g = rx_state[`IDX(6)];
	wire [31:0] h = rx_state[`IDX(7)];

	// Message word used by this round.
	wire [31:0] w_now = rx_w[31:0];

	wire [31:0] big_sig0, big_sig1, choose, majority, small_sig0, small_sig1;


	e0 e0_blk (a, big_sig0);
	e1 e1_blk (e, big_sig1);
	ch ch_blk (e, f, g, choose);
	maj maj_blk (a, b, c, majority);
	s0 s0_blk (rx_w[63:32], small_sig0);
	s1 s1_blk (rx_w[479:448], small_sig1);

	wire [31:0] t1 = h + big_sig1 + choose + w_now + k;
	wire [31:0] t2 = big_sig0 + majority;
	// Next schedule word, built from words 14, 9, 1 and 0 of the window.
	wire [31:0] w_new = small_sig1 + rx_w[319:288] + small_sig0 + w_now;


	always @ (posedge clk)
	begin
		// Drop word 0 and append the new word at the top.
		tx_w <= {w_new, rx_w[511:32]};

		// Words 7..0 of the next state, most significant first.
		tx_state <= {g, f, e, d + t1, c, b, a, t1 + t2};
	end

endmodule



12 changes: 12 additions & 0 deletions tools/Makefile
@@ -0,0 +1,12 @@
# Builds the host-side helper tools.
TARGETS=byteswap
CC=gcc

all: $(TARGETS)

# Each tool is a single C file compiled straight to an executable.
%: %.c
	$(CC) -O2 -Wall -I. -s -o $@ $<

# "all" is declared phony too, so a stray file named "all" cannot hide it.
.PHONY: all clean install

clean:
	rm -f $(TARGETS) *.o
60 changes: 60 additions & 0 deletions tools/byteswap.c
@@ -0,0 +1,60 @@
/*
* Milkymist SoC
* Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* Reverse the order of the 8 bits in a byte (bit 0 <-> bit 7, ...). */
static unsigned char reverse_bits(unsigned char b)
{
	unsigned char r = 0;
	int i;

	for(i=0;i<8;i++)
		if(b & (1 << i))
			r |= (0x80 >> i);
	return r;
}

/*
 * Usage: byteswap <infile> <outfile>
 *
 * Copies <infile> to <outfile> two bytes at a time, reversing the bit order
 * inside every byte while keeping the bytes in their original order. This
 * is exactly what the former 16-bit "reverse all bits, then swap the two
 * bytes" sequence produced; working on bytes directly makes the result
 * independent of the host byte order.
 *
 * A trailing odd byte, if any, is not copied (input is processed in
 * 16-bit units).
 *
 * Returns 0 on success and 1 on a usage or I/O error.
 */
int main(int argc, char *argv[])
{
	FILE *fdi, *fdo;
	unsigned char in[2];
	unsigned char out[2];
	int ret;

	if(argc != 3) {
		fprintf(stderr, "Usage: byteswap <infile> <outfile>\n");
		return 1;
	}
	fdi = fopen(argv[1], "rb");
	if(!fdi) {
		perror("Unable to open input file");
		return 1;
	}
	/* binary mode: the output must not undergo newline translation */
	fdo = fopen(argv[2], "wb");
	if(!fdo) {
		perror("Unable to open output file");
		fclose(fdi);
		return 1;
	}
	ret = 0;
	while(fread(in, 1, 2, fdi) == 2) {
		out[0] = reverse_bits(in[0]);
		out[1] = reverse_bits(in[1]);
		if(fwrite(out, 1, 2, fdo) != 2) {
			perror("Unable to write output file");
			ret = 1;
			break;
		}
	}
	if(ret == 0 && ferror(fdi)) {
		perror("Unable to read input file");
		ret = 1;
	}
	fclose(fdi);
	if(fclose(fdo) != 0) {
		perror("Unable to close output file");
		return 1;
	}
	return ret;
}